diff --git a/sys/contrib/openzfs/cmd/dbufstat.in b/sys/contrib/openzfs/cmd/dbufstat.in index 08c22864e5d8..1252496577bc 100755 --- a/sys/contrib/openzfs/cmd/dbufstat.in +++ b/sys/contrib/openzfs/cmd/dbufstat.in @@ -1,684 +1,686 @@ #!/usr/bin/env @PYTHON_SHEBANG@ # # Print out statistics for all cached dmu buffers. This information # is available through the dbufs kstat and may be post-processed as # needed by the script. # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License, Version 1.0 only # (the "License"). You may not use this file except in compliance # with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # Copyright (C) 2013 Lawrence Livermore National Security, LLC. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # # This script must remain compatible with and Python 3.6+. # import sys import getopt import errno import re bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", - "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] bincompat = ["cached", "direct", "indirect", "bonus", "spill"] dhdr = ["pool", "objset", "object", "dtype", "cached"] dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", "indirect", "bonus", "spill"] -dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", - "dbc", "list", "atype", "flags", "count", "asize", "access", - "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", - "l2_comp", "aholds"] +dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state", + "dbholds", "dbc", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds"] thdr = ["pool", "objset", "dtype", "cached"] txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", "bonus", "spill"] -tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", - "dbc", "dbholds", "list", "atype", "flags", "count", "asize", - "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", +tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta", + "state", "dbc", "dbholds", "list", "atype", "flags", "count", + "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] cols = { # hdr: [size, scale, description] "pool": [15, -1, "pool name"], "objset": [6, -1, "dataset identification number"], "object": [10, -1, "object number"], "level": 
[5, -1, "indirection level of buffer"], "blkid": [8, -1, "block number of buffer"], "offset": [12, 1024, "offset in object of buffer"], "dbsize": [7, 1024, "size of buffer"], + "usize": [7, 1024, "size of attached user data"], "meta": [4, -1, "is this buffer metadata?"], "state": [5, -1, "state of buffer (read, cached, etc)"], "dbholds": [7, 1000, "number of holds on buffer"], "dbc": [3, -1, "in dbuf cache"], "list": [4, -1, "which ARC list contains this buffer"], "atype": [7, -1, "ARC header type (data or metadata)"], "flags": [9, -1, "ARC read flags"], "count": [5, -1, "ARC data count"], "asize": [7, 1024, "size of this ARC buffer"], "access": [10, -1, "time this ARC buffer was last accessed"], "mru": [5, 1000, "hits while on the ARC's MRU list"], "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], "mfu": [5, 1000, "hits while on the ARC's MFU list"], "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], "l2": [5, 1000, "hits while on the L2ARC"], "l2_dattr": [8, -1, "L2ARC disk address/offset"], "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], "aholds": [6, 1000, "number of holds on this ARC buffer"], "dtype": [27, -1, "dnode type"], "btype": [27, -1, "bonus buffer type"], "data_bs": [7, 1024, "data block size"], "meta_bs": [7, 1024, "metadata block size"], "bsize": [6, 1024, "bonus buffer size"], "lvls": [6, -1, "number of indirection levels"], "dholds": [6, 1000, "number of holds on dnode"], "blocks": [8, 1000, "number of allocated blocks"], "dsize": [12, 1024, "size of dnode"], "cached": [6, 1024, "bytes cached for all blocks"], "direct": [6, 1024, "bytes cached for direct blocks"], "indirect": [8, 1024, "bytes cached for indirect blocks"], "bonus": [5, 1024, "bytes cached for bonus buffer"], "spill": [5, 1024, "bytes cached for spill block"], } hdr = None xhdr = None sep = " " # Default separator is 2 spaces cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] " "[-s string] [-F filter]\n") raw = 0 if sys.platform.startswith("freebsd"): import io # Requires py-sysctl on FreeBSD import sysctl def default_ifile(): dbufs = sysctl.filter("kstat.zfs.misc.dbufs")[0].value sys.stdin = io.StringIO(dbufs) return "-" elif sys.platform.startswith("linux"): def default_ifile(): return "/proc/spl/kstat/zfs/dbufs" def print_incompat_helper(incompat): cnt = 0 for key in sorted(incompat): if cnt == 0: sys.stderr.write("\t") elif cnt > 8: sys.stderr.write(",\n\t") cnt = 0 else: sys.stderr.write(", ") sys.stderr.write("%s" % key) cnt += 1 sys.stderr.write("\n\n") def detailed_usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("Field definitions incompatible with '-b' option:\n") print_incompat_helper(bincompat) sys.stderr.write("Field definitions incompatible with '-d' option:\n") print_incompat_helper(dincompat) sys.stderr.write("Field definitions incompatible with '-t' option:\n") print_incompat_helper(tincompat) sys.stderr.write("Field definitions are as follows:\n") for key in sorted(cols.keys()): sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) sys.stderr.write("\n") sys.exit(0) def usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("\t -b : Print table of information for each dbuf\n") sys.stderr.write("\t -d : Print table of information for each dnode\n") sys.stderr.write("\t -h : Print this help message\n") sys.stderr.write("\t -n : Exclude header from output\n") sys.stderr.write("\t -r : Print raw values\n") sys.stderr.write("\t -t : Print table of information 
for each dnode type" "\n") sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") sys.stderr.write("\t -i : Redirect input from the specified file\n") sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") sys.stderr.write("\t -F : Filter output by value or regex\n") sys.stderr.write("\nExamples:\n") sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n") sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n") sys.stderr.write("\tdbufstat -v\n") sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n") sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n") sys.stderr.write("\n") sys.exit(1) def prettynum(sz, scale, num=0): global raw suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] index = 0 save = 0 if raw or scale == -1: return "%*s" % (sz, num) # Rounding error, return 0 elif 0 < num < 1: num = 0 while num > scale and index < 5: save = num num = num / scale index += 1 if index == 0: return "%*d" % (sz, num) if (save / scale) < 10: return "%*.1f%s" % (sz - 1, num, suffix[index]) else: return "%*d%s" % (sz - 1, num, suffix[index]) def print_values(v): global hdr global sep try: for col in hdr: sys.stdout.write("%s%s" % ( prettynum(cols[col][0], cols[col][1], v[col]), sep)) sys.stdout.write("\n") except IOError as e: if e.errno == errno.EPIPE: sys.exit(1) def print_header(): global hdr global sep try: for col in hdr: sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) sys.stdout.write("\n") except IOError as e: if e.errno == errno.EPIPE: sys.exit(1) def get_typestring(t): ot_strings = [ "DMU_OT_NONE", # general: "DMU_OT_OBJECT_DIRECTORY", "DMU_OT_OBJECT_ARRAY", "DMU_OT_PACKED_NVLIST", "DMU_OT_PACKED_NVLIST_SIZE", "DMU_OT_BPOBJ", "DMU_OT_BPOBJ_HDR", # spa: "DMU_OT_SPACE_MAP_HEADER", "DMU_OT_SPACE_MAP", # zil: "DMU_OT_INTENT_LOG", # dmu: "DMU_OT_DNODE", "DMU_OT_OBJSET", # dsl: "DMU_OT_DSL_DIR", "DMU_OT_DSL_DIR_CHILD_MAP", "DMU_OT_DSL_DS_SNAP_MAP", "DMU_OT_DSL_PROPS", "DMU_OT_DSL_DATASET", # zpl: "DMU_OT_ZNODE", "DMU_OT_OLDACL", "DMU_OT_PLAIN_FILE_CONTENTS", "DMU_OT_DIRECTORY_CONTENTS", "DMU_OT_MASTER_NODE", "DMU_OT_UNLINKED_SET", # zvol: "DMU_OT_ZVOL", "DMU_OT_ZVOL_PROP", # other; for testing only! 
"DMU_OT_PLAIN_OTHER", "DMU_OT_UINT64_OTHER", "DMU_OT_ZAP_OTHER", # new object types: "DMU_OT_ERROR_LOG", "DMU_OT_SPA_HISTORY", "DMU_OT_SPA_HISTORY_OFFSETS", "DMU_OT_POOL_PROPS", "DMU_OT_DSL_PERMS", "DMU_OT_ACL", "DMU_OT_SYSACL", "DMU_OT_FUID", "DMU_OT_FUID_SIZE", "DMU_OT_NEXT_CLONES", "DMU_OT_SCAN_QUEUE", "DMU_OT_USERGROUP_USED", "DMU_OT_USERGROUP_QUOTA", "DMU_OT_USERREFS", "DMU_OT_DDT_ZAP", "DMU_OT_DDT_STATS", "DMU_OT_SA", "DMU_OT_SA_MASTER_NODE", "DMU_OT_SA_ATTR_REGISTRATION", "DMU_OT_SA_ATTR_LAYOUTS", "DMU_OT_SCAN_XLATE", "DMU_OT_DEDUP", "DMU_OT_DEADLIST", "DMU_OT_DEADLIST_HDR", "DMU_OT_DSL_CLONES", "DMU_OT_BPOBJ_SUBOBJ"] otn_strings = { 0x80: "DMU_OTN_UINT8_DATA", 0xc0: "DMU_OTN_UINT8_METADATA", 0x81: "DMU_OTN_UINT16_DATA", 0xc1: "DMU_OTN_UINT16_METADATA", 0x82: "DMU_OTN_UINT32_DATA", 0xc2: "DMU_OTN_UINT32_METADATA", 0x83: "DMU_OTN_UINT64_DATA", 0xc3: "DMU_OTN_UINT64_METADATA", 0x84: "DMU_OTN_ZAP_DATA", 0xc4: "DMU_OTN_ZAP_METADATA", 0xa0: "DMU_OTN_UINT8_ENC_DATA", 0xe0: "DMU_OTN_UINT8_ENC_METADATA", 0xa1: "DMU_OTN_UINT16_ENC_DATA", 0xe1: "DMU_OTN_UINT16_ENC_METADATA", 0xa2: "DMU_OTN_UINT32_ENC_DATA", 0xe2: "DMU_OTN_UINT32_ENC_METADATA", 0xa3: "DMU_OTN_UINT64_ENC_DATA", 0xe3: "DMU_OTN_UINT64_ENC_METADATA", 0xa4: "DMU_OTN_ZAP_ENC_DATA", 0xe4: "DMU_OTN_ZAP_ENC_METADATA"} # If "-rr" option is used, don't convert to string representation if raw > 1: return "%i" % t try: if t < len(ot_strings): return ot_strings[t] else: return otn_strings[t] except (IndexError, KeyError): return "(UNKNOWN)" def get_compstring(c): comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"] # If "-rr" option is used, don't convert to string representation if raw > 1: return "%i" % c try: return comp_strings[c] except IndexError: return "%i" % c def parse_line(line, labels): global hdr new = dict() val = None for col in hdr: # These are "special" fields computed in the update_dict # function, prevent KeyError exception on labels[col] for these. 
if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: val = line[labels[col]] if col in ['pool', 'flags']: new[col] = str(val) elif col in ['dtype', 'btype']: new[col] = get_typestring(int(val)) elif col in ['l2_comp']: new[col] = get_compstring(int(val)) else: new[col] = int(val) return new def update_dict(d, k, line, labels): pool = line[labels['pool']] objset = line[labels['objset']] key = line[labels[k]] dbsize = int(line[labels['dbsize']]) + usize = int(line[labels['usize']]) blkid = int(line[labels['blkid']]) level = int(line[labels['level']]) if pool not in d: d[pool] = dict() if objset not in d[pool]: d[pool][objset] = dict() if key not in d[pool][objset]: d[pool][objset][key] = parse_line(line, labels) d[pool][objset][key]['bonus'] = 0 d[pool][objset][key]['cached'] = 0 d[pool][objset][key]['direct'] = 0 d[pool][objset][key]['indirect'] = 0 d[pool][objset][key]['spill'] = 0 - d[pool][objset][key]['cached'] += dbsize + d[pool][objset][key]['cached'] += dbsize + usize if blkid == -1: d[pool][objset][key]['bonus'] += dbsize elif blkid == -2: d[pool][objset][key]['spill'] += dbsize else: if level == 0: d[pool][objset][key]['direct'] += dbsize else: d[pool][objset][key]['indirect'] += dbsize return d def skip_line(vals, filters): ''' Determines if a line should be skipped during printing based on a set of filters ''' if len(filters) == 0: return False for key in vals: if key in filters: val = prettynum(cols[key][0], cols[key][1], vals[key]).strip() # we want a full match here if re.match("(?:" + filters[key] + r")\Z", val) is None: return True return False def print_dict(d, filters, noheader): if not noheader: print_header() for pool in list(d.keys()): for objset in list(d[pool].keys()): for v in list(d[pool][objset].values()): if not skip_line(v, filters): print_values(v) def dnodes_build_dict(filehandle): labels = dict() dnodes = dict() # First 3 lines are header information, skip the first two for i in range(2): next(filehandle) # The third line contains the labels and index locations for i, v in enumerate(next(filehandle).split()): labels[v] = i # The rest of the file is buffer information for line in filehandle: update_dict(dnodes, 'object', line.split(), labels) return dnodes def types_build_dict(filehandle): labels = dict() types = dict() # First 3 lines are header information, skip the first two for i in range(2): next(filehandle) # The third line contains the labels and index locations for i, v in enumerate(next(filehandle).split()): labels[v] = i # The rest of the file is buffer information for line in filehandle: update_dict(types, 'dtype', line.split(), labels) return types def buffers_print_all(filehandle, filters, noheader): labels = dict() # First 3 lines are header information, skip the first two for i in range(2): next(filehandle) # The third line contains the labels and index locations for i, v in enumerate(next(filehandle).split()): labels[v] = i if not noheader: print_header() # The rest of the file is buffer information for line in filehandle: vals = parse_line(line.split(), labels) if not skip_line(vals, filters): print_values(vals) def main(): global hdr global sep global raw desired_cols = None bflag = False dflag = False hflag = False ifile = None ofile = None tflag = False vflag = False xflag = False nflag = False filters = dict() try: opts, args = getopt.getopt( sys.argv[1:], "bdf:hi:o:rs:tvxF:n", [ "buffers", "dnodes", "columns", "help", "infile", "outfile", "separator", "types", "verbose", "extended", "filter" ] ) except getopt.error: usage() 
opts = None for opt, arg in opts: if opt in ('-b', '--buffers'): bflag = True if opt in ('-d', '--dnodes'): dflag = True if opt in ('-f', '--columns'): desired_cols = arg if opt in ('-h', '--help'): hflag = True if opt in ('-i', '--infile'): ifile = arg if opt in ('-o', '--outfile'): ofile = arg if opt in ('-r', '--raw'): raw += 1 if opt in ('-s', '--separator'): sep = arg if opt in ('-t', '--types'): tflag = True if opt in ('-v', '--verbose'): vflag = True if opt in ('-x', '--extended'): xflag = True if opt in ('-n', '--noheader'): nflag = True if opt in ('-F', '--filter'): fils = [x.strip() for x in arg.split(",")] for fil in fils: f = [x.strip() for x in fil.split("=")] if len(f) != 2: sys.stderr.write("Invalid filter '%s'.\n" % fil) sys.exit(1) if f[0] not in cols: sys.stderr.write("Invalid field '%s' in filter.\n" % f[0]) sys.exit(1) if f[0] in filters: sys.stderr.write("Field '%s' specified multiple times in " "filter.\n" % f[0]) sys.exit(1) try: re.compile("(?:" + f[1] + r")\Z") except re.error: sys.stderr.write("Invalid regex for field '%s' in " "filter.\n" % f[0]) sys.exit(1) filters[f[0]] = f[1] if hflag or (xflag and desired_cols): usage() if vflag: detailed_usage() # Ensure at most only one of b, d, or t flags are set if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): usage() if bflag: hdr = bxhdr if xflag else bhdr elif tflag: hdr = txhdr if xflag else thdr else: # Even if dflag is False, it's the default if none set dflag = True hdr = dxhdr if xflag else dhdr if desired_cols: hdr = desired_cols.split(",") invalid = [] incompat = [] for ele in hdr: if ele not in cols: invalid.append(ele) elif ((bflag and bincompat and ele in bincompat) or (dflag and dincompat and ele in dincompat) or (tflag and tincompat and ele in tincompat)): incompat.append(ele) if len(invalid) > 0: sys.stderr.write("Invalid column definition! -- %s\n" % invalid) usage() if len(incompat) > 0: sys.stderr.write("Incompatible field specified! -- %s\n" % incompat) usage() if ofile: try: tmp = open(ofile, "w") sys.stdout = tmp except IOError: sys.stderr.write("Cannot open %s for writing\n" % ofile) sys.exit(1) if not ifile: ifile = default_ifile() if ifile != "-": try: tmp = open(ifile, "r") sys.stdin = tmp except IOError: sys.stderr.write("Cannot open %s for reading\n" % ifile) sys.exit(1) if bflag: buffers_print_all(sys.stdin, filters, nflag) if dflag: print_dict(dnodes_build_dict(sys.stdin), filters, nflag) if tflag: print_dict(types_build_dict(sys.stdin), filters, nflag) if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 1cc8b8971a2d..bddf395df7ee 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -1,1103 +1,1116 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct dsl_crypto_params; struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_ENCRYPTED 0x20 #define DMU_OT_BYTESWAP_MASK 0x1f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata, encrypted) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) /* * MDB doesn't have dmu_ot; it defines these macros itself. */ #ifndef ZFS_MDB #define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) #define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) #define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? 
\ (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) #define DMU_OT_IS_CRITICAL(ot) \ (DMU_OT_IS_METADATA(ot) && \ (ot) != DMU_OT_DNODE && \ (ot) != DMU_OT_DIRECTORY_CONTENTS && \ (ot) != DMU_OT_SA) /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ DMU_OT_BYTESWAP_IMPL(ot)) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). 
*/ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), } dmu_object_type_t; /* * These flags are intended to be used to specify the "txg_how" * parameter when calling the dmu_tx_assign() function. See the comment * above dmu_tx_assign() for more details on the meaning of these flags. */ #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_PROJECTUSED_OBJECT (-3ULL) /* * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT. */ #define DMU_OBJACCT_PREFIX "obj-" #define DMU_OBJACCT_PREFIX_LEN 4 /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. 
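 *
 * As an illustrative sketch only (FTAG is the usual ZFS hold tag; error
 * handling abbreviated), a short-lived, read-only open looks like:
 *
 *     objset_t *os;
 *     int err = dmu_objset_hold("pool/dataset", FTAG, &os);
 *     if (err == 0) {
 *         ... read-only use of os ...
 *         dmu_objset_rele(os, FTAG);
 *     }
 *
 * Consumers needing write access or a long-lived reference use
 * dmu_objset_own()/dmu_objset_disown() instead.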
*/ typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int dmu_objset_hold(const char *name, const void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t key_required, const void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, const void *tag); void dmu_objset_disown(objset_t *os, boolean_t key_required, const void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. 
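 *
 * A minimal sketch (names illustrative; "tx" must already be assigned):
 *
 *     uint64_t obj = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS,
 *         0, DMU_OT_NONE, 0, tx);
 *
 * allocates a new object with the default block size and no bonus buffer,
 * whereas dmu_object_claim(os, obj, ...) insists on a specific number.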
* * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx); int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the number of levels on a dnode. nlevels must be greater than the * current number of levels or an EINVAL will be returned. */ int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx); /* * Set the data blocksize for an object. * * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly * to accommodate the change. When calling this function, the caller must * ensure that the object's nlevels can sufficiently support the new maxblkid. */ int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx); /* * Set the checksum property on a dnode. 
The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_will_dirty() * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp); int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You must not access the dmu_buf_t after releasing * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. 
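 *
 * A minimal sketch of the hold/dirty/release cycle (illustrative only;
 * "tx" is a transaction already assigned and holding this object, FTAG is
 * the usual ZFS hold tag):
 *
 *     dmu_buf_t *db;
 *     int err = dmu_buf_hold(os, object, offset, FTAG, &db,
 *         DMU_READ_PREFETCH);
 *     if (err == 0) {
 *         dmu_buf_will_dirty(db, tx);
 *         ... modify db->db_data ...
 *         dmu_buf_rele(db, FTAG);
 *     }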
*/ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, const void *tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, const void *tag); void dmu_buf_rele(dmu_buf_t *db, const void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); uint64_t dmu_buf_user_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, const void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. 
* * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; + /* Size of user data, for inclusion in dbuf_cache accounting. */ + uint64_t dbu_size; + /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); dbu->dbu_evict_func_sync = evict_func_sync; dbu->dbu_evict_func_async = evict_func_async; taskq_init_ent(&dbu->dbu_tqent); #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); +/* + * User data size accounting. This can be used to artifically inflate the size + * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough + * to satisfy memory reclaim requests. It's not used for anything else, and + * defaults to 0. + */ +uint64_t dmu_buf_user_size(dmu_buf_t *db); +void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); +void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); + /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. 
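 *
 * As an illustrative sketch (my_state_t, my_evict_sync and the ms_* fields
 * are hypothetical consumer code, not part of this interface): a client
 * embedding a dmu_buf_user_t as the first member of its private state can
 * attach that state and account for its in-core footprint so that
 * dbuf_evict_thread sees the real cost of keeping the dbuf cached:
 *
 *     my_state_t *ms = kmem_zalloc(sizeof (*ms), KM_SLEEP);
 *     dmu_buf_init_user(&ms->ms_dbu, my_evict_sync, NULL, &ms->ms_db);
 *     VERIFY3P(dmu_buf_set_user(db, &ms->ms_dbu), ==, NULL);
 *     dmu_buf_add_user_size(db, sizeof (*ms));
 *
 * dmu_buf_get_user(db) then returns &ms->ms_dbu, and any further in-core
 * allocations hung off ms should be mirrored with matching
 * dmu_buf_add_user_size()/dmu_buf_sub_user_size() calls.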
*/ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. 
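 *
 * Putting the transaction rules above and callback registration together,
 * a typical write might look like this sketch (my_commit_cb and arg are
 * placeholders; error handling abbreviated):
 *
 *     dmu_tx_t *tx = dmu_tx_create(os);
 *     dmu_tx_hold_write(tx, object, off, len);
 *     int err = dmu_tx_assign(tx, TXG_WAIT);
 *     if (err != 0) {
 *         dmu_tx_abort(tx);
 *         return (err);
 *     }
 *     dmu_tx_callback_register(tx, my_commit_cb, arg);
 *     dmu_write(os, object, off, len, buf, tx);
 *     dmu_tx_commit(tx);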
* * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. * * When multiple callbacks are registered to the transaction, the callbacks * will be called in reverse order to let Lustre, the only user of commit * callback currently, take the fast path of its commit callback handling. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf extern uint_t zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. 
*/ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; boolean_t ot_encrypt; const char *ot_name; } dmu_object_type_info_t; typedef const struct dmu_object_byteswap_info { arc_byteswap_func_t ob_func; const char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) 
*/ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef struct zfs_file_info { uint64_t zfi_user; uint64_t zfi_group; uint64_t zfi_project; uint64_t zfi_generation; } zfs_file_info_t; typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, boolean_t replay); /* * Initial setup and final teardown. 
*/ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, zfs_file_t *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern uint_t dmu_prefetch_max; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ diff --git a/sys/contrib/openzfs/lib/libshare/os/freebsd/nfs.c b/sys/contrib/openzfs/lib/libshare/os/freebsd/nfs.c index d9fc66106369..d4cdb07a4947 100644 --- a/sys/contrib/openzfs/lib/libshare/os/freebsd/nfs.c +++ b/sys/contrib/openzfs/lib/libshare/os/freebsd/nfs.c @@ -1,213 +1,210 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 2020, 2022 by Delphix. All rights reserved. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include "libshare_impl.h" #include "nfs.h" #define _PATH_MOUNTDPID "/var/run/mountd.pid" #define OPTSSIZE 1024 #define MAXLINESIZE (PATH_MAX + OPTSSIZE) #define ZFS_EXPORTS_FILE "/etc/zfs/exports" #define ZFS_EXPORTS_LOCK ZFS_EXPORTS_FILE".lock" /* * This function translates options to a format acceptable by exports(5), eg. 
* * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 \ * zfs.freebsd.org 69.147.83.54 * * Accepted input formats: * * ro,network=192.168.0.0,mask=255.255.255.0,maproot=0,zfs.freebsd.org * ro network=192.168.0.0 mask=255.255.255.0 maproot=0 zfs.freebsd.org * -ro,-network=192.168.0.0,-mask=255.255.255.0,-maproot=0,zfs.freebsd.org * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 \ * zfs.freebsd.org * * Recognized keywords: * * ro, maproot, mapall, mask, network, sec, alldirs, public, webnfs, * index, quiet */ static int translate_opts(const char *shareopts, FILE *out) { static const char *const known_opts[] = { "ro", "maproot", "mapall", "mask", "network", "sec", "alldirs", "public", "webnfs", "index", "quiet" }; char oldopts[OPTSSIZE], newopts[OPTSSIZE]; char *o, *s = NULL; unsigned int i; size_t len; strlcpy(oldopts, shareopts, sizeof (oldopts)); newopts[0] = '\0'; s = oldopts; while ((o = strsep(&s, "-, ")) != NULL) { if (o[0] == '\0') continue; for (i = 0; i < ARRAY_SIZE(known_opts); ++i) { len = strlen(known_opts[i]); if (strncmp(known_opts[i], o, len) == 0 && (o[len] == '\0' || o[len] == '=')) { strlcat(newopts, "-", sizeof (newopts)); break; } } strlcat(newopts, o, sizeof (newopts)); strlcat(newopts, " ", sizeof (newopts)); } return (fputs(newopts, out)); } static int nfs_enable_share_impl(sa_share_impl_t impl_share, FILE *tmpfile) { const char *shareopts = impl_share->sa_shareopts; if (strcmp(shareopts, "on") == 0) shareopts = ""; boolean_t need_free; char *mp; int rc = nfs_escape_mountpoint(impl_share->sa_mountpoint, &mp, &need_free); if (rc != SA_OK) return (rc); if (fputs(mp, tmpfile) == EOF || fputc('\t', tmpfile) == EOF || translate_opts(shareopts, tmpfile) == EOF || fputc('\n', tmpfile) == EOF) { fprintf(stderr, "failed to write to temporary file\n"); rc = SA_SYSTEM_ERR; } if (need_free) free(mp); return (rc); } static int nfs_enable_share(sa_share_impl_t impl_share) { return (nfs_toggle_share( ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, NULL, impl_share, nfs_enable_share_impl)); } static int nfs_disable_share_impl(sa_share_impl_t impl_share, FILE *tmpfile) { (void) impl_share, (void) tmpfile; return (SA_OK); } static int nfs_disable_share(sa_share_impl_t impl_share) { return (nfs_toggle_share( ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, NULL, impl_share, nfs_disable_share_impl)); } static boolean_t nfs_is_shared(sa_share_impl_t impl_share) { return (nfs_is_shared_impl(ZFS_EXPORTS_FILE, impl_share)); } static int nfs_validate_shareopts(const char *shareopts) { if (strlen(shareopts) == 0) return (SA_SYNTAX_ERR); return (SA_OK); } /* * Commit the shares by restarting mountd. */ static int nfs_commit_shares(void) { struct pidfh *pfh; pid_t mountdpid; start: pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &mountdpid); if (pfh != NULL) { /* mountd(8) is not running. */ pidfile_remove(pfh); return (SA_OK); } if (errno != EEXIST) { /* Cannot open pidfile for some reason. */ return (SA_SYSTEM_ERR); } if (mountdpid == -1) { /* mountd(8) exists, but didn't write the PID yet */ usleep(500); goto start; } /* We have mountd(8) PID in mountdpid variable. 
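Reviewer note: the effect of translate_opts() above is easiest to see on a concrete input. A self-contained demo of the same keyword-prefixing rule (recognized exports(5) keywords gain a leading '-', anything else passes through unchanged); the sample option string is arbitrary:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	static const char *const known[] = { "ro", "maproot", "mapall",
	    "mask", "network", "sec", "alldirs", "public", "webnfs",
	    "index", "quiet" };
	char opts[] = "ro,network=192.168.0.0,mask=255.255.255.0,zfs.freebsd.org";
	char *s = opts, *o;

	while ((o = strsep(&s, "-, ")) != NULL) {
		size_t i, len;

		if (o[0] == '\0')
			continue;
		for (i = 0; i < sizeof (known) / sizeof (known[0]); i++) {
			len = strlen(known[i]);
			if (strncmp(known[i], o, len) == 0 &&
			    (o[len] == '\0' || o[len] == '='))
				break;
		}
		if (i < sizeof (known) / sizeof (known[0]))
			printf("-");
		printf("%s ", o);
	}
	/* Prints: -ro -network=192.168.0.0 -mask=255.255.255.0 zfs.freebsd.org */
	printf("\n");
	return (0);
}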
*/ kill(mountdpid, SIGHUP); return (SA_OK); } static void nfs_truncate_shares(void) { nfs_reset_shares(ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE); } const sa_fstype_t libshare_nfs_type = { .enable_share = nfs_enable_share, .disable_share = nfs_disable_share, .is_shared = nfs_is_shared, .validate_shareopts = nfs_validate_shareopts, .commit_shares = nfs_commit_shares, .truncate_shares = nfs_truncate_shares, }; diff --git a/sys/contrib/openzfs/lib/libspl/os/freebsd/mnttab.c b/sys/contrib/openzfs/lib/libspl/os/freebsd/mnttab.c index a4673084ad5f..26a4cd992cfb 100644 --- a/sys/contrib/openzfs/lib/libspl/os/freebsd/mnttab.c +++ b/sys/contrib/openzfs/lib/libspl/os/freebsd/mnttab.c @@ -1,237 +1,234 @@ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file implements Solaris compatible getmntany() and hasmntopt() * functions. 
*/ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include static char * mntopt(char **p) { char *cp = *p; char *retstr; while (*cp && isspace(*cp)) cp++; retstr = cp; while (*cp && *cp != ',') cp++; if (*cp) { *cp = '\0'; cp++; } *p = cp; return (retstr); } char * hasmntopt(struct mnttab *mnt, const char *opt) { char tmpopts[MNT_LINE_MAX]; char *f, *opts = tmpopts; if (mnt->mnt_mntopts == NULL) return (NULL); (void) strlcpy(opts, mnt->mnt_mntopts, MNT_LINE_MAX); f = mntopt(&opts); for (; *f; f = mntopt(&opts)) { if (strncmp(opt, f, strlen(opt)) == 0) return (f - tmpopts + mnt->mnt_mntopts); } return (NULL); } static void optadd(char *mntopts, size_t size, const char *opt) { if (mntopts[0] != '\0') strlcat(mntopts, ",", size); strlcat(mntopts, opt, size); } static __thread char gfstypename[MFSNAMELEN]; static __thread char gmntfromname[MNAMELEN]; static __thread char gmntonname[MNAMELEN]; static __thread char gmntopts[MNTMAXSTR]; void statfs2mnttab(struct statfs *sfs, struct mnttab *mp) { long flags; strlcpy(gfstypename, sfs->f_fstypename, sizeof (gfstypename)); mp->mnt_fstype = gfstypename; strlcpy(gmntfromname, sfs->f_mntfromname, sizeof (gmntfromname)); mp->mnt_special = gmntfromname; strlcpy(gmntonname, sfs->f_mntonname, sizeof (gmntonname)); mp->mnt_mountp = gmntonname; flags = sfs->f_flags; gmntopts[0] = '\0'; #define OPTADD(opt) optadd(gmntopts, sizeof (gmntopts), (opt)) if (flags & MNT_RDONLY) OPTADD(MNTOPT_RO); else OPTADD(MNTOPT_RW); if (flags & MNT_NOSUID) OPTADD(MNTOPT_NOSETUID); else OPTADD(MNTOPT_SETUID); if (flags & MNT_UPDATE) OPTADD(MNTOPT_REMOUNT); if (flags & MNT_NOATIME) OPTADD(MNTOPT_NOATIME); else OPTADD(MNTOPT_ATIME); OPTADD(MNTOPT_NOXATTR); if (flags & MNT_NOEXEC) OPTADD(MNTOPT_NOEXEC); else OPTADD(MNTOPT_EXEC); #undef OPTADD mp->mnt_mntopts = gmntopts; } static pthread_rwlock_t gsfs_lock = PTHREAD_RWLOCK_INITIALIZER; static struct statfs *gsfs = NULL; static int allfs = 0; static int statfs_init(void) { struct statfs *sfs; int error; (void) pthread_rwlock_wrlock(&gsfs_lock); if (gsfs != NULL) { free(gsfs); gsfs = NULL; } allfs = getfsstat(NULL, 0, MNT_NOWAIT); if (allfs == -1) goto fail; gsfs = malloc(sizeof (gsfs[0]) * allfs * 2); if (gsfs == NULL) goto fail; allfs = getfsstat(gsfs, (long)(sizeof (gsfs[0]) * allfs * 2), MNT_NOWAIT); if (allfs == -1) goto fail; sfs = realloc(gsfs, allfs * sizeof (gsfs[0])); if (sfs != NULL) gsfs = sfs; (void) pthread_rwlock_unlock(&gsfs_lock); return (0); fail: error = errno; if (gsfs != NULL) free(gsfs); gsfs = NULL; allfs = 0; (void) pthread_rwlock_unlock(&gsfs_lock); return (error); } int getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) { int i, error; error = statfs_init(); if (error != 0) return (error); (void) pthread_rwlock_rdlock(&gsfs_lock); for (i = 0; i < allfs; i++) { if (mrefp->mnt_special != NULL && strcmp(mrefp->mnt_special, gsfs[i].f_mntfromname) != 0) { continue; } if (mrefp->mnt_mountp != NULL && strcmp(mrefp->mnt_mountp, gsfs[i].f_mntonname) != 0) { continue; } if (mrefp->mnt_fstype != NULL && strcmp(mrefp->mnt_fstype, gsfs[i].f_fstypename) != 0) { continue; } statfs2mnttab(&gsfs[i], mgetp); (void) pthread_rwlock_unlock(&gsfs_lock); return (0); } (void) pthread_rwlock_unlock(&gsfs_lock); return (-1); } int getmntent(FILE *fp, struct mnttab *mp) { int error, nfs; nfs = (int)lseek(fileno(fp), 0, SEEK_CUR); if (nfs == -1) return (errno); /* If nfs is 0, we want to refresh out cache. 
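Reviewer note: a hedged usage sketch of the Solaris-compatible getmntany()/hasmntopt() implemented above (assumes libspl's <sys/mnttab.h> compatibility header is on the include path; the mountpoint is hypothetical). Reference fields left NULL act as wildcards, and hasmntopt() returns a pointer into mnt_mntopts when the option is present:

#include <stdio.h>
#include <string.h>
#include <sys/mnttab.h>		/* libspl compatibility header (assumed) */

int
main(void)
{
	struct mnttab ref, ent;

	memset(&ref, 0, sizeof (ref));
	ref.mnt_mountp = "/tank";	/* hypothetical mountpoint */
	ref.mnt_fstype = "zfs";

	/* The FILE argument is unused by the FreeBSD implementation. */
	if (getmntany(NULL, &ent, &ref) == 0)
		printf("%s mounted %s atime\n", ent.mnt_special,
		    hasmntopt(&ent, "noatime") != NULL ? "without" : "with");
	return (0);
}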
*/ if (nfs == 0 || gsfs == NULL) { error = statfs_init(); if (error != 0) return (error); } (void) pthread_rwlock_rdlock(&gsfs_lock); if (nfs >= allfs) { (void) pthread_rwlock_unlock(&gsfs_lock); return (-1); } statfs2mnttab(&gsfs[nfs], mp); (void) pthread_rwlock_unlock(&gsfs_lock); if (lseek(fileno(fp), 1, SEEK_CUR) == -1) return (errno); return (0); } diff --git a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c index 34976f7bbf46..3c50daf471b7 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c +++ b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -1,139 +1,136 @@ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file implements Solaris compatible zmount() function. 
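Reviewer note: the do_mount() shim below drives nmount(2) with name/value iovec pairs rather than the classic mount(2) interface. A minimal FreeBSD illustration of that construction for a hypothetical dataset and mountpoint (no option parsing, no "errmsg" buffer, and it requires root to actually succeed):

#include <sys/uio.h>
#include <sys/mount.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Fill two consecutive iovecs with an nmount(2) name/value pair. */
static void
pair(struct iovec *iov, const char *name, const char *val)
{
	iov[0].iov_base = __DECONST(char *, name);
	iov[0].iov_len = strlen(name) + 1;
	iov[1].iov_base = __DECONST(char *, val);
	iov[1].iov_len = (val != NULL) ? strlen(val) + 1 : 0;
}

int
main(void)
{
	struct iovec iov[6];

	pair(&iov[0], "fstype", "zfs");
	pair(&iov[2], "fspath", "/tank");	/* hypothetical mountpoint */
	pair(&iov[4], "from", "tank");		/* hypothetical dataset */

	if (nmount(iov, 6, 0) == -1)
		fprintf(stderr, "nmount: %s\n", strerror(errno));
	return (0);
}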
*/ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include "../../libzfs_impl.h" static void build_iovec(struct iovec **iov, int *iovlen, const char *name, void *val, size_t len) { int i; if (*iovlen < 0) return; i = *iovlen; *iov = realloc(*iov, sizeof (**iov) * (i + 2)); if (*iov == NULL) { *iovlen = -1; return; } (*iov)[i].iov_base = strdup(name); (*iov)[i].iov_len = strlen(name) + 1; i++; (*iov)[i].iov_base = val; if (len == (size_t)-1) { if (val != NULL) len = strlen(val) + 1; else len = 0; } (*iov)[i].iov_len = (int)len; *iovlen = ++i; } int do_mount(zfs_handle_t *zhp, const char *mntpt, const char *opts, int flags) { struct iovec *iov; char *optstr, *p, *tofree; int iovlen, rv; const char *spec = zfs_get_name(zhp); assert(spec != NULL); assert(mntpt != NULL); assert(opts != NULL); tofree = optstr = strdup(opts); assert(optstr != NULL); iov = NULL; iovlen = 0; if (strstr(optstr, MNTOPT_REMOUNT) != NULL) build_iovec(&iov, &iovlen, "update", NULL, 0); if (flags & MS_RDONLY) build_iovec(&iov, &iovlen, "ro", NULL, 0); build_iovec(&iov, &iovlen, "fstype", __DECONST(char *, MNTTYPE_ZFS), (size_t)-1); build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, mntpt), (size_t)-1); build_iovec(&iov, &iovlen, "from", __DECONST(char *, spec), (size_t)-1); while ((p = strsep(&optstr, ",/")) != NULL) build_iovec(&iov, &iovlen, p, NULL, (size_t)-1); rv = nmount(iov, iovlen, 0); free(tofree); if (rv < 0) return (errno); return (rv); } int do_unmount(zfs_handle_t *zhp, const char *mntpt, int flags) { (void) zhp; if (unmount(mntpt, flags) < 0) return (errno); return (0); } int zfs_mount_delegation_check(void) { return (0); } /* Called from the tail end of zpool_disable_datasets() */ void zpool_disable_datasets_os(zpool_handle_t *zhp, boolean_t force) { (void) zhp, (void) force; } /* Called from the tail end of zfs_unmount() */ void zpool_disable_volume_os(const char *name) { (void) name; } diff --git a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_setproctitle.c b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_setproctitle.c index 4a6d12cf70cf..5961527ebc2c 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_setproctitle.c +++ b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_setproctitle.c @@ -1,299 +1,276 @@ /* * Copyright © 2013 Guillem Jover * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include static struct { /* Original value. */ const char *arg0; /* Title space available. */ char *base, *end; /* Pointer to original nul character within base. */ char *nul; boolean_t warned; boolean_t reset; int error; } SPT; #define LIBBSD_IS_PATHNAME_SEPARATOR(c) ((c) == '/') #define SPT_MAXTITLE 255 extern const char *__progname; static const char * getprogname(void) { return (__progname); } static void setprogname(const char *progname) { size_t i; for (i = strlen(progname); i > 0; i--) { if (LIBBSD_IS_PATHNAME_SEPARATOR(progname[i - 1])) { __progname = progname + i; return; } } __progname = progname; } static inline size_t spt_min(size_t a, size_t b) { return ((a < b) ? a : b); } -/* - * For discussion on the portability of the various methods, see - * https://lists.freebsd.org/pipermail/freebsd-stable/2008-June/043136.html - */ -static int -spt_clearenv(void) -{ - char **tmp; - - tmp = malloc(sizeof (*tmp)); - if (tmp == NULL) - return (errno); - - tmp[0] = NULL; - environ = tmp; - - return (0); -} - static int spt_copyenv(int envc, char *envp[]) { char **envcopy; char *eq; int envsize; int i, error; if (environ != envp) return (0); /* * Make a copy of the old environ array of pointers, in case * clearenv() or setenv() is implemented to free the internal * environ array, because we will need to access the old environ * contents to make the new copy. */ envsize = (envc + 1) * sizeof (char *); envcopy = malloc(envsize); if (envcopy == NULL) return (errno); memcpy(envcopy, envp, envsize); - error = spt_clearenv(); - if (error) { - environ = envp; - free(envcopy); - return (error); - } + environ = NULL; for (i = 0; envcopy[i]; i++) { eq = strchr(envcopy[i], '='); if (eq == NULL) continue; *eq = '\0'; if (setenv(envcopy[i], eq + 1, 1) < 0) error = errno; *eq = '='; if (error) { + clearenv(); environ = envp; free(envcopy); return (error); } } /* * Dispose of the shallow copy, now that we've finished transfering * the old environment. */ free(envcopy); return (0); } static int spt_copyargs(int argc, char *argv[]) { char *tmp; int i; for (i = 1; i < argc || (i >= argc && argv[i]); i++) { if (argv[i] == NULL) continue; tmp = strdup(argv[i]); if (tmp == NULL) return (errno); argv[i] = tmp; } return (0); } void zfs_setproctitle_init(int argc, char *argv[], char *envp[]) { char *base, *end, *nul, *tmp; int i, envc, error; /* Try to make sure we got called with main() arguments. 
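Reviewer note: zfs_setproctitle_init() here and zfs_setproctitle() further below are meant to be used together: the init call must see the real main() arguments while argv[] and environ are still contiguous, after which the title can be rewritten in place. A minimal usage sketch (the prototypes are assumed to come from libzutil.h, and the title text is arbitrary):

#include <libzutil.h>	/* assumed to declare zfs_setproctitle_init()/zfs_setproctitle() */

extern char **environ;

int
main(int argc, char *argv[])
{
	/* Must run first, before anything clobbers argv[]/environ. */
	zfs_setproctitle_init(argc, argv, environ);

	/* ... real work ... */
	zfs_setproctitle("working on %s", "tank");	/* hypothetical title */
	return (0);
}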
*/ if (argc < 0) return; base = argv[0]; if (base == NULL) return; nul = base + strlen(base); end = nul + 1; for (i = 0; i < argc || (i >= argc && argv[i]); i++) { if (argv[i] == NULL || argv[i] != end) continue; end = argv[i] + strlen(argv[i]) + 1; } for (i = 0; envp[i]; i++) { if (envp[i] != end) continue; end = envp[i] + strlen(envp[i]) + 1; } envc = i; SPT.arg0 = strdup(argv[0]); if (SPT.arg0 == NULL) { SPT.error = errno; return; } tmp = strdup(getprogname()); if (tmp == NULL) { SPT.error = errno; return; } setprogname(tmp); error = spt_copyenv(envc, envp); if (error) { SPT.error = error; return; } error = spt_copyargs(argc, argv); if (error) { SPT.error = error; return; } SPT.nul = nul; SPT.base = base; SPT.end = end; } void zfs_setproctitle(const char *fmt, ...) { /* Use buffer in case argv[0] is passed. */ char buf[SPT_MAXTITLE + 1]; va_list ap; char *nul; int len; if (SPT.base == NULL) { if (!SPT.warned) { warnx("setproctitle not initialized, please" "call zfs_setproctitle_init()"); SPT.warned = B_TRUE; } return; } if (fmt) { if (fmt[0] == '-') { /* Skip program name prefix. */ fmt++; len = 0; } else { /* Print program name heading for grep. */ snprintf(buf, sizeof (buf), "%s: ", getprogname()); len = strlen(buf); } va_start(ap, fmt); len += vsnprintf(buf + len, sizeof (buf) - len, fmt, ap); va_end(ap); } else { len = snprintf(buf, sizeof (buf), "%s", SPT.arg0); } if (len <= 0) { SPT.error = errno; return; } if (!SPT.reset) { memset(SPT.base, 0, SPT.end - SPT.base); SPT.reset = B_TRUE; } else { memset(SPT.base, 0, spt_min(sizeof (buf), SPT.end - SPT.base)); } len = spt_min(len, spt_min(sizeof (buf), SPT.end - SPT.base) - 1); memcpy(SPT.base, buf, len); nul = SPT.base + len; if (nul < SPT.nul) { *SPT.nul = '.'; } else if (nul == SPT.nul && nul + 1 < SPT.end) { *SPT.nul = ' '; *++nul = '\0'; } } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c index 4d67cbb183ec..c820d7a6d22d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c @@ -1,222 +1,219 @@ /* * Copyright (c) 2008, 2009 Edward Tomasz Napierała * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include struct zfs2bsd { uint32_t zb_zfs; int zb_bsd; }; static const struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA}, {ACE_WRITE_DATA, ACL_WRITE_DATA}, {ACE_EXECUTE, ACL_EXECUTE}, {ACE_APPEND_DATA, ACL_APPEND_DATA}, {ACE_DELETE_CHILD, ACL_DELETE_CHILD}, {ACE_DELETE, ACL_DELETE}, {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES}, {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES}, {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS}, {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS}, {ACE_READ_ACL, ACL_READ_ACL}, {ACE_WRITE_ACL, ACL_WRITE_ACL}, {ACE_WRITE_OWNER, ACL_WRITE_OWNER}, {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE}, {0, 0}}; static const struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE, ACL_ENTRY_FILE_INHERIT}, {ACE_DIRECTORY_INHERIT_ACE, ACL_ENTRY_DIRECTORY_INHERIT}, {ACE_NO_PROPAGATE_INHERIT_ACE, ACL_ENTRY_NO_PROPAGATE_INHERIT}, {ACE_INHERIT_ONLY_ACE, ACL_ENTRY_INHERIT_ONLY}, {ACE_INHERITED_ACE, ACL_ENTRY_INHERITED}, {ACE_SUCCESSFUL_ACCESS_ACE_FLAG, ACL_ENTRY_SUCCESSFUL_ACCESS}, {ACE_FAILED_ACCESS_ACE_FLAG, ACL_ENTRY_FAILED_ACCESS}, {0, 0}}; static int _bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table) { const struct zfs2bsd *tmp; int bsd = 0; for (tmp = table; tmp->zb_zfs != 0; tmp++) { if (zfs & tmp->zb_zfs) bsd |= tmp->zb_bsd; } return (bsd); } static uint32_t _zfs_from_bsd(int bsd, const struct zfs2bsd *table) { const struct zfs2bsd *tmp; uint32_t zfs = 0; for (tmp = table; tmp->zb_bsd != 0; tmp++) { if (bsd & tmp->zb_bsd) zfs |= tmp->zb_zfs; } return (zfs); } int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries) { int i; struct acl_entry *entry; const ace_t *ace; if (nentries < 1) { printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n"); return (EINVAL); } if (nentries > ACL_MAX_ENTRIES) { /* * I believe it may happen only when moving a pool * from SunOS to FreeBSD. 
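Reviewer note: the perms[]/flags[] tables above drive a generic "translate bitmask A into bitmask B" helper in each direction (_bsd_from_zfs()/_zfs_from_bsd()). A self-contained illustration of the same table-driven technique with made-up bit values — the real ACE_*/ACL_* constants differ:

#include <stdint.h>
#include <stdio.h>

struct bitmap { uint32_t from; int to; };

/* Hypothetical bit assignments, for illustration only. */
#define SRC_READ	0x01
#define SRC_WRITE	0x02
#define DST_READ	0x10
#define DST_WRITE	0x20

static const struct bitmap table[] = {
	{ SRC_READ,  DST_READ },
	{ SRC_WRITE, DST_WRITE },
	{ 0, 0 }			/* terminator, as in perms[]/flags[] */
};

static int
translate(uint32_t src, const struct bitmap *t)
{
	int dst = 0;

	/* Copy every set source bit to its destination counterpart. */
	for (; t->from != 0; t++)
		if (src & t->from)
			dst |= t->to;
	return (dst);
}

int
main(void)
{
	printf("0x%x\n", translate(SRC_READ | SRC_WRITE, table));	/* 0x30 */
	return (0);
}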
*/ printf("acl_from_aces: ZFS ACL too big to fit " "into 'struct acl'; returning EINVAL.\n"); return (EINVAL); } memset(aclp, 0, sizeof (*aclp)); aclp->acl_maxcnt = ACL_MAX_ENTRIES; aclp->acl_cnt = nentries; for (i = 0; i < nentries; i++) { entry = &(aclp->acl_entry[i]); ace = &(aces[i]); if (ace->a_flags & ACE_OWNER) entry->ae_tag = ACL_USER_OBJ; else if (ace->a_flags & ACE_GROUP) entry->ae_tag = ACL_GROUP_OBJ; else if (ace->a_flags & ACE_EVERYONE) entry->ae_tag = ACL_EVERYONE; else if (ace->a_flags & ACE_IDENTIFIER_GROUP) entry->ae_tag = ACL_GROUP; else entry->ae_tag = ACL_USER; if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP) entry->ae_id = ace->a_who; else entry->ae_id = ACL_UNDEFINED_ID; entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms); entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags); switch (ace->a_type) { case ACE_ACCESS_ALLOWED_ACE_TYPE: entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW; break; case ACE_ACCESS_DENIED_ACE_TYPE: entry->ae_entry_type = ACL_ENTRY_TYPE_DENY; break; case ACE_SYSTEM_AUDIT_ACE_TYPE: entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT; break; case ACE_SYSTEM_ALARM_ACE_TYPE: entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM; break; default: panic("acl_from_aces: a_type is 0x%x", ace->a_type); } } return (0); } void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp) { int i; const struct acl_entry *entry; ace_t *ace; memset(aces, 0, sizeof (*aces) * aclp->acl_cnt); *nentries = aclp->acl_cnt; for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); ace = &(aces[i]); ace->a_who = entry->ae_id; if (entry->ae_tag == ACL_USER_OBJ) ace->a_flags = ACE_OWNER; else if (entry->ae_tag == ACL_GROUP_OBJ) ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP); else if (entry->ae_tag == ACL_GROUP) ace->a_flags = ACE_IDENTIFIER_GROUP; else if (entry->ae_tag == ACL_EVERYONE) ace->a_flags = ACE_EVERYONE; else /* ACL_USER */ ace->a_flags = 0; ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms); ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags); switch (entry->ae_entry_type) { case ACL_ENTRY_TYPE_ALLOW: ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE; break; case ACL_ENTRY_TYPE_DENY: ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE; break; case ACL_ENTRY_TYPE_ALARM: ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE; break; case ACL_ENTRY_TYPE_AUDIT: ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE; break; default: panic("aces_from_acl: ae_entry_type is 0x%x", entry->ae_entry_type); } } } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c index 80040fc6a3e3..cdfd37f3e05f 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c @@ -1,123 +1,120 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #if !defined(__LP64__) && !defined(__mips_n32) && \ !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \ !defined(HAS_EMULATED_ATOMIC64) #ifdef _KERNEL #include struct mtx atomic_mtx; MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF); #else #include #define mtx_lock(lock) pthread_mutex_lock(lock) #define mtx_unlock(lock) pthread_mutex_unlock(lock) static pthread_mutex_t atomic_mtx; static __attribute__((constructor)) void atomic_init(void) { pthread_mutex_init(&atomic_mtx, NULL); } #endif void atomic_add_64(volatile uint64_t *target, int64_t delta) { mtx_lock(&atomic_mtx); *target += delta; mtx_unlock(&atomic_mtx); } void atomic_dec_64(volatile uint64_t *target) { mtx_lock(&atomic_mtx); *target -= 1; mtx_unlock(&atomic_mtx); } uint64_t atomic_swap_64(volatile uint64_t *a, uint64_t value) { uint64_t ret; mtx_lock(&atomic_mtx); ret = *a; *a = value; mtx_unlock(&atomic_mtx); return (ret); } uint64_t atomic_load_64(volatile uint64_t *a) { uint64_t ret; mtx_lock(&atomic_mtx); ret = *a; mtx_unlock(&atomic_mtx); return (ret); } uint64_t atomic_add_64_nv(volatile uint64_t *target, int64_t delta) { uint64_t newval; mtx_lock(&atomic_mtx); newval = (*target += delta); mtx_unlock(&atomic_mtx); return (newval); } uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) { uint64_t oldval; mtx_lock(&atomic_mtx); oldval = *target; if (oldval == cmp) *target = newval; mtx_unlock(&atomic_mtx); return (oldval); } #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c index 6b2872bcc066..4b9cc65d641e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c @@ -1,38 +1,35 @@ /* * Copyright 2014 The FreeBSD Project. * All rights reserved. * * This software was developed by Steven Hartland. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
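Reviewer note: on platforms without native 64-bit atomics, spl_atomic.c above serializes every operation through one global lock; the userland branch uses a pthread mutex. A standalone sketch of that pattern for the compare-and-swap case (names shortened, behaviour matches the lock-protected atomic_cas_64() above):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Compare-and-swap emulated with a mutex: always returns the old value. */
static uint64_t
cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
{
	uint64_t oldval;

	pthread_mutex_lock(&lock);
	oldval = *target;
	if (oldval == cmp)
		*target = newval;
	pthread_mutex_unlock(&lock);
	return (oldval);
}

int
main(void)
{
	volatile uint64_t v = 5;

	(void) cas_64(&v, 5, 7);	/* succeeds: v becomes 7 */
	printf("%llu\n", (unsigned long long)v);
	return (0);
}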
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include /* CSTYLED */ SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c index ca9a677567d9..95af6200cd01 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c @@ -1,352 +1,349 @@ /* * Copyright (c) 2006-2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
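Reviewer note: the single static DTrace probe defined in spl_dtrace.c above (set__error) is fired elsewhere in the FreeBSD SPL through the matching SDT_PROBE1() macro from <sys/sdt.h>. A hedged kernel-context fragment, not standalone code, showing how such a probe is typically triggered:

#include <sys/sdt.h>

/* Kernel-only fragment (assumed helper): fire set__error with an errno value. */
static int
spl_set_error_example(int err)
{
	SDT_PROBE1(sdt, , , set__error, err);
	return (err);
}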
*/ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KMEM_DEBUG #include #include #endif #ifdef _KERNEL MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris"); #else #define malloc(size, type, flags) malloc(size) #define free(addr, type) free(addr) #endif #ifdef KMEM_DEBUG struct kmem_item { struct stack stack; LIST_ENTRY(kmem_item) next; }; static LIST_HEAD(, kmem_item) kmem_items; static struct mtx kmem_items_mtx; MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF); #endif /* KMEM_DEBUG */ #include void * zfs_kmem_alloc(size_t size, int kmflags) { void *p; #ifdef KMEM_DEBUG struct kmem_item *i; size += sizeof (struct kmem_item); #endif p = malloc(MAX(size, 16), M_SOLARIS, kmflags); #ifndef _KERNEL if (kmflags & KM_SLEEP) assert(p != NULL); #endif #ifdef KMEM_DEBUG if (p != NULL) { i = p; p = (uint8_t *)p + sizeof (struct kmem_item); stack_save(&i->stack); mtx_lock(&kmem_items_mtx); LIST_INSERT_HEAD(&kmem_items, i, next); mtx_unlock(&kmem_items_mtx); } #endif return (p); } void zfs_kmem_free(void *buf, size_t size __unused) { #ifdef KMEM_DEBUG if (buf == NULL) { printf("%s: attempt to free NULL\n", __func__); return; } struct kmem_item *i; buf = (uint8_t *)buf - sizeof (struct kmem_item); mtx_lock(&kmem_items_mtx); LIST_FOREACH(i, &kmem_items, next) { if (i == buf) break; } ASSERT3P(i, !=, NULL); LIST_REMOVE(i, next); mtx_unlock(&kmem_items_mtx); memset(buf, 0xDC, MAX(size, 16)); #endif free(buf, M_SOLARIS); } static uint64_t kmem_size_val; static void kmem_size_init(void *unused __unused) { kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE; if (kmem_size_val > vm_kmem_size) kmem_size_val = vm_kmem_size; } SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); uint64_t kmem_size(void) { return (kmem_size_val); } static int kmem_std_constructor(void *mem, int size __unused, void *private, int flags) { struct kmem_cache *cache = private; return (cache->kc_constructor(mem, cache->kc_private, flags)); } static void kmem_std_destructor(void *mem, int size __unused, void *private) { struct kmem_cache *cache = private; cache->kc_destructor(mem, cache->kc_private); } kmem_cache_t * kmem_cache_create(const char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags) { kmem_cache_t *cache; ASSERT3P(vmp, ==, NULL); cache = kmem_alloc(sizeof (*cache), KM_SLEEP); strlcpy(cache->kc_name, name, sizeof (cache->kc_name)); cache->kc_constructor = constructor; cache->kc_destructor = destructor; cache->kc_private = private; #if defined(_KERNEL) && !defined(KMEM_DEBUG) cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, constructor != NULL ? kmem_std_constructor : NULL, destructor != NULL ? kmem_std_destructor : NULL, NULL, NULL, align > 0 ? 
align - 1 : 0, cflags); #else cache->kc_size = bufsize; #endif return (cache); } void kmem_cache_destroy(kmem_cache_t *cache) { #if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zdestroy(cache->kc_zone); #endif kmem_free(cache, sizeof (*cache)); } void * kmem_cache_alloc(kmem_cache_t *cache, int flags) { #if defined(_KERNEL) && !defined(KMEM_DEBUG) return (uma_zalloc_arg(cache->kc_zone, cache, flags)); #else void *p; p = kmem_alloc(cache->kc_size, flags); if (p != NULL && cache->kc_constructor != NULL) kmem_std_constructor(p, cache->kc_size, cache, flags); return (p); #endif } void kmem_cache_free(kmem_cache_t *cache, void *buf) { #if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zfree_arg(cache->kc_zone, buf, cache); #else if (cache->kc_destructor != NULL) kmem_std_destructor(buf, cache->kc_size, cache); kmem_free(buf, cache->kc_size); #endif } /* * Allow our caller to determine if there are running reaps. * * This call is very conservative and may return B_TRUE even when * reaping activity isn't active. If it returns B_FALSE, then reaping * activity is definitely inactive. */ boolean_t kmem_cache_reap_active(void) { return (B_FALSE); } /* * Reap (almost) everything soon. * * Note: this does not wait for the reap-tasks to complete. Caller * should use kmem_cache_reap_active() (above) and/or moderation to * avoid scheduling too many reap-tasks. */ #ifdef _KERNEL void kmem_cache_reap_soon(kmem_cache_t *cache) { #ifndef KMEM_DEBUG #if __FreeBSD_version >= 1300043 uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN); #else zone_drain(cache->kc_zone); #endif #endif } void kmem_reap(void) { #if __FreeBSD_version >= 1300043 uma_reclaim(UMA_RECLAIM_TRIM); #else uma_reclaim(); #endif } #else void kmem_cache_reap_soon(kmem_cache_t *cache __unused) { } void kmem_reap(void) { } #endif int kmem_debugging(void) { return (0); } void * calloc(size_t n, size_t s) { return (kmem_zalloc(n * s, KM_NOSLEEP)); } char * kmem_vasprintf(const char *fmt, va_list adx) { char *msg; va_list adx2; va_copy(adx2, adx); msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); (void) vsprintf(msg, fmt, adx2); va_end(adx2); return (msg); } #include #include #ifdef KMEM_DEBUG #error "KMEM_DEBUG not currently supported" #endif uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache) { return (uma_zone_get_cur(cache->kc_zone)); } uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache) { return (cache->kc_zone->uz_size); } /* * Register a move callback for cache defragmentation. * XXX: Unimplemented but harmless to stub out for now. */ void spl_kmem_cache_set_move(kmem_cache_t *skc, kmem_cbrc_t (move)(void *, void *, size_t, void *)) { ASSERT3P(move, !=, NULL); } #ifdef KMEM_DEBUG void kmem_show(void *); void kmem_show(void *dummy __unused) { struct kmem_item *i; mtx_lock(&kmem_items_mtx); if (LIST_EMPTY(&kmem_items)) printf("KMEM_DEBUG: No leaked elements.\n"); else { printf("KMEM_DEBUG: Leaked elements:\n\n"); LIST_FOREACH(i, &kmem_items, next) { printf("address=%p\n", i); stack_print_ddb(&i->stack); printf("\n"); } } mtx_unlock(&kmem_items_mtx); } SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL); #endif /* KMEM_DEBUG */ diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c index 43cd4da02e30..f657ef2a3acb 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c @@ -1,573 +1,570 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Links to Illumos.org for more information on kstat function: * [1] https://illumos.org/man/1M/kstat * [2] https://illumos.org/man/9f/kstat_create */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } void __kstat_set_seq_raw_ops(kstat_t *ksp, int (*headers)(struct seq_file *f), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.seq_headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } static int kstat_default_update(kstat_t *ksp, int rw) { ASSERT3P(ksp, !=, NULL); if (rw == KSTAT_WRITE) return (EACCES); return (0); } static int kstat_resize_raw(kstat_t *ksp) { if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) return (ENOMEM); free(ksp->ks_raw_buf, M_TEMP); ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK); return (0); } static void * kstat_raw_default_addr(kstat_t *ksp, loff_t n) { if (n == 0) return (ksp->ks_data); return (NULL); } static int kstat_sysctl(SYSCTL_HANDLER_ARGS) { kstat_t *ksp = arg1; kstat_named_t *ksent; uint64_t val; ksent = ksp->ks_data; /* Select the correct element */ ksent += arg2; /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); val = ksent->value.ui64; return (sysctl_handle_64(oidp, &val, 0, req)); } static int kstat_sysctl_string(SYSCTL_HANDLER_ARGS) { kstat_t *ksp = arg1; kstat_named_t *ksent = ksp->ks_data; char *val; uint32_t len = 0; /* Select the correct element */ ksent += arg2; /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); val = KSTAT_NAMED_STR_PTR(ksent); len = KSTAT_NAMED_STR_BUFLEN(ksent); val[len-1] = '\0'; return (sysctl_handle_string(oidp, val, len, req)); } static int kstat_sysctl_dataset(SYSCTL_HANDLER_ARGS) { kstat_t *ksp = arg1; 
kstat_named_t *ksent; kstat_named_t *ksent_ds; uint64_t val; char *ds_name; uint32_t ds_len = 0; ksent_ds = ksent = ksp->ks_data; ds_name = KSTAT_NAMED_STR_PTR(ksent_ds); ds_len = KSTAT_NAMED_STR_BUFLEN(ksent_ds); ds_name[ds_len-1] = '\0'; if (!zone_dataset_visible(ds_name, NULL)) { return (EPERM); } /* Select the correct element */ ksent += arg2; /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); val = ksent->value.ui64; return (sysctl_handle_64(oidp, &val, 0, req)); } static int kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS) { kstat_t *ksp = arg1; kstat_named_t *ksent = ksp->ks_data; char *val; uint32_t len = 0; /* Select the correct element */ ksent += arg2; val = KSTAT_NAMED_STR_PTR(ksent); len = KSTAT_NAMED_STR_BUFLEN(ksent); val[len-1] = '\0'; if (!zone_dataset_visible(val, NULL)) { return (EPERM); } return (sysctl_handle_string(oidp, val, len, req)); } static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { struct sbuf sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; sbuf_new_for_sysctl(&sb, NULL, 0, req); /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ sbuf_printf(&sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, kip->reads, kip->writes, kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); rc = sbuf_finish(&sb); sbuf_delete(&sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { struct sbuf sb; void *data; kstat_t *ksp = arg1; void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; else addr_op = kstat_raw_default_addr; mutex_enter(ksp->ks_lock); /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); ksp->ks_raw_bufsize = PAGE_SIZE; ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK); n = 0; has_header = (ksp->ks_raw_ops.headers || ksp->ks_raw_ops.seq_headers); restart_headers: if (ksp->ks_raw_ops.headers) { rc = ksp->ks_raw_ops.headers( ksp->ks_raw_buf, ksp->ks_raw_bufsize); } else if (ksp->ks_raw_ops.seq_headers) { struct seq_file f; f.sf_buf = ksp->ks_raw_buf; f.sf_size = ksp->ks_raw_bufsize; rc = ksp->ks_raw_ops.seq_headers(&f); } if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; if (rc == 0) { sbuf_cat(&sb, "\n"); sbuf_cat(&sb, ksp->ks_raw_buf); } } while ((data = addr_op(ksp, n)) != NULL) { restart: if (ksp->ks_raw_ops.data) { rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf, ksp->ks_raw_bufsize, data); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) sbuf_cat(&sb, ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); sbuf_hexdump(&sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); rc = sbuf_finish(&sb); sbuf_delete(&sb); return (rc); } kstat_t * __kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags) { char buf[KSTAT_STRLEN]; struct sysctl_oid *root; kstat_t *ksp; char *pool; KASSERT(instance == 0, ("instance=%d", instance)); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ASSERT3U(ks_ndata, ==, 1); if (class == NULL) class = "misc"; /* * Allocate the main structure. 
We don't need to keep a copy of * module in here, because it is only used for sysctl node creation * done in this function. */ ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO); ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = instance; (void) strlcpy(ksp->ks_name, name, KSTAT_STRLEN); (void) strlcpy(ksp->ks_class, class, KSTAT_STRLEN); ksp->ks_type = ks_type; ksp->ks_flags = flags; ksp->ks_update = kstat_default_update; mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); ksp->ks_lock = &ksp->ks_private_lock; switch (ksp->ks_type) { case KSTAT_TYPE_RAW: ksp->ks_ndata = 1; ksp->ks_data_size = ks_ndata; break; case KSTAT_TYPE_NAMED: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); break; default: panic("Undefined kstat type %d\n", ksp->ks_type); } if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) ksp->ks_data = NULL; else ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); /* * Some kstats use a module name like "zfs/poolname" to distinguish a * set of kstats belonging to a specific pool. Split on '/' to add an * extra node for the pool name if needed. */ (void) strlcpy(buf, module, KSTAT_STRLEN); module = buf; pool = strchr(module, '/'); if (pool != NULL) *pool++ = '\0'; /* * Create sysctl tree for those statistics: * * kstat.[.].. */ sysctl_ctx_init(&ksp->ks_sysctl_ctx); root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s tree!\n", __func__, module); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } if (pool != NULL) { root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, module, pool); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } } root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, class, CTLFLAG_RW, 0, ""); if (root == NULL) { if (pool != NULL) printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, module, pool, class); else printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, module, class); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } if (ksp->ks_type == KSTAT_TYPE_NAMED) { root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, name, CTLFLAG_RW, 0, ""); if (root == NULL) { if (pool != NULL) printf("%s: Cannot create kstat.%s.%s.%s.%s " "tree!\n", __func__, module, pool, class, name); else printf("%s: Cannot create kstat.%s.%s.%s " "tree!\n", __func__, module, class, name); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } } ksp->ks_sysctl_root = root; return (ksp); } static void kstat_install_named(kstat_t *ksp) { kstat_named_t *ksent; char *namelast; int typelast; ksent = ksp->ks_data; VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL); typelast = 0; namelast = NULL; for (int i = 0; i < ksp->ks_ndata; i++, ksent++) { if (ksent->data_type != 0) { typelast = ksent->data_type; namelast = ksent->name; } switch (typelast) { case KSTAT_DATA_CHAR: /* Not Implemented */ break; case 
KSTAT_DATA_INT32: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "I", namelast); break; case KSTAT_DATA_UINT32: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U32 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "IU", namelast); break; case KSTAT_DATA_INT64: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "Q", namelast); break; case KSTAT_DATA_UINT64: if (strcmp(ksp->ks_class, "dataset") == 0) { SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl_dataset, "QU", namelast); } else { SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "QU", namelast); } break; case KSTAT_DATA_LONG: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "L", namelast); break; case KSTAT_DATA_ULONG: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "LU", namelast); break; case KSTAT_DATA_STRING: if (strcmp(ksp->ks_class, "dataset") == 0) { SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl_dataset_string, "A", namelast); } else { SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl_string, "A", namelast); } break; default: panic("unsupported type: %d", typelast); } } } void kstat_install(kstat_t *ksp) { struct sysctl_oid *root; if (ksp->ks_ndata == UINT32_MAX) VERIFY3U(ksp->ks_type, ==, KSTAT_TYPE_RAW); switch (ksp->ks_type) { case KSTAT_TYPE_NAMED: return (kstat_install_named(ksp)); case KSTAT_TYPE_RAW: if (ksp->ks_raw_ops.data) { root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, ksp, 0, kstat_sysctl_raw, "A", ksp->ks_name); } else { root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksp->ks_name, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, ksp, 0, kstat_sysctl_raw, "", ksp->ks_name); } break; case KSTAT_TYPE_IO: root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, 0, kstat_sysctl_io, "A", ksp->ks_name); break; case KSTAT_TYPE_TIMER: case KSTAT_TYPE_INTR: default: panic("unsupported kstat type %d\n", ksp->ks_type); } VERIFY3P(root, !=, NULL); ksp->ks_sysctl_root = root; } void kstat_delete(kstat_t *ksp) { sysctl_ctx_free(&ksp->ks_sysctl_ctx); ksp->ks_lock = NULL; mutex_destroy(&ksp->ks_private_lock); if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) kmem_free(ksp->ks_data, ksp->ks_data_size); free(ksp, M_KSTAT); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c index e3653167323b..a5fc996b6550 100644 --- 
a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c @@ -1,109 +1,106 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include static struct opensolaris_utsname hw_utsname = { .machine = MACHINE }; utsname_t * utsname(void) { return (&hw_utsname); } static void opensolaris_utsname_init(void *arg) { hw_utsname.sysname = ostype; hw_utsname.nodename = prison0.pr_hostname; hw_utsname.release = osrelease; snprintf(hw_utsname.version, sizeof (hw_utsname.version), "%d", osreldate); } char * kmem_strdup(const char *s) { char *buf; buf = kmem_alloc(strlen(s) + 1, KM_SLEEP); strcpy(buf, s); return (buf); } int ddi_copyin(const void *from, void *to, size_t len, int flags) { /* Fake ioctl() issued by kernel, 'from' is a kernel address */ if (flags & FKIOCTL) { memcpy(to, from, len); return (0); } return (copyin(from, to, len)); } int ddi_copyout(const void *from, void *to, size_t len, int flags) { /* Fake ioctl() issued by kernel, 'from' is a kernel address */ if (flags & FKIOCTL) { memcpy(to, from, len); return (0); } return (copyout(from, to, len)); } void spl_panic(const char *file, const char *func, int line, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); va_end(ap); } SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c index 5ecd3d310361..f2dd7c8e7f8a 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c @@ -1,438 +1,435 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include int secpolicy_nfs(cred_t *cr) { return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON)); } int secpolicy_zfs(cred_t *cr) { return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); } int secpolicy_zfs_proc(cred_t *cr, proc_t *proc) { return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); } int secpolicy_sys_config(cred_t *cr, int checkonly __unused) { return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG)); } int secpolicy_zinject(cred_t *cr) { return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT)); } int secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused) { return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT)); } int secpolicy_fs_owner(struct mount *mp, cred_t *cr) { if (zfs_super_owner) { if (cr->cr_uid == mp->mnt_cred->cr_uid && cr->cr_prison == mp->mnt_cred->cr_prison) { return (0); } } return (EPERM); } /* * This check is done in kern_link(), so we could just return 0 here. */ extern int hardlink_check_uid; int secpolicy_basic_link(vnode_t *vp, cred_t *cr) { if (!hardlink_check_uid) return (0); if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_LINK)); } int secpolicy_vnode_stky_modify(cred_t *cr) { return (EPERM); } int secpolicy_vnode_remove(vnode_t *vp, cred_t *cr) { if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); } int secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode) { if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0) return (EACCES); if ((accmode & VWRITE) && spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) { return (EACCES); } if (accmode & VEXEC) { if (vp->v_type == VDIR) { if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0) return (EACCES); } else { if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0) return (EACCES); } } return (0); } /* * Like secpolicy_vnode_access() but we get the actual wanted mode and the * current mode of the file, not the missing bits. 
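As a quick illustration of the conversion described here (the wanted mode minus the mode already granted is just ~curmode & wantmode before delegating to secpolicy_vnode_access()), the following standalone userland snippet uses stand-in flag values, not the kernel's real accmode_t constants:

#include <stdio.h>

/* Stand-ins for VREAD/VWRITE/VEXEC; the values are illustrative only. */
#define	X_VEXEC		0x01
#define	X_VWRITE	0x02
#define	X_VREAD		0x04

/* Same derivation: keep only the wanted bits that are not already granted. */
static int
missing_bits(int curmode, int wantmode)
{
	return (~curmode & wantmode);
}

int
main(void)
{
	int cur = X_VREAD;		/* already have read */
	int want = X_VREAD | X_VWRITE;	/* caller wants read and write */

	/* Prints 0x2: only the write bit still needs a privilege check. */
	printf("missing = %#x\n", missing_bits(cur, want));
	return (0);
}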
*/ int secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t curmode, accmode_t wantmode) { accmode_t mode; mode = ~curmode & wantmode; if (mode == 0) return (0); return (secpolicy_vnode_access(cr, vp, owner, mode)); } int secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner) { static int privs[] = { PRIV_VFS_ADMIN, PRIV_VFS_READ, PRIV_VFS_WRITE, PRIV_VFS_EXEC, PRIV_VFS_LOOKUP }; int i; if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); /* Same as secpolicy_vnode_setdac */ if (owner == cr->cr_uid) return (0); for (i = 0; i < sizeof (privs)/sizeof (int); i++) { int priv; switch (priv = privs[i]) { case PRIV_VFS_EXEC: if (vp->v_type == VDIR) continue; break; case PRIV_VFS_LOOKUP: if (vp->v_type != VDIR) continue; break; } if (spl_priv_check_cred(cr, priv) == 0) return (0); } return (EPERM); } int secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner) { if (owner == cr->cr_uid) return (0); if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); } int secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, const struct vattr *ovap, int flags, int unlocked_access(void *, int, cred_t *), void *node) { int mask = vap->va_mask; int error; if (mask & AT_SIZE) { if (vp->v_type == VDIR) return (EISDIR); error = unlocked_access(node, VWRITE, cr); if (error) return (error); } if (mask & AT_MODE) { /* * If not the owner of the file then check privilege * for two things: the privilege to set the mode at all * and, if we're setting setuid, we also need permissions * to add the set-uid bit, if we're not the owner. * In the specific case of creating a set-uid root * file, we need even more permissions. */ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); if (error) return (error); error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr); if (error) return (error); } else { vap->va_mode = ovap->va_mode; } if (mask & (AT_UID | AT_GID)) { error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); if (error) return (error); /* * To change the owner of a file, or change the group of * a file to a group of which we are not a member, the * caller must have privilege. */ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || ((mask & AT_GID) && vap->va_gid != ovap->va_gid && !groupmember(vap->va_gid, cr))) { if (secpolicy_fs_owner(vp->v_mount, cr) != 0) { error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN); if (error) return (error); } } if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { secpolicy_setid_clear(vap, vp, cr); } } if (mask & (AT_ATIME | AT_MTIME)) { /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. 
*/ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); if (error && (vap->va_vaflags & VA_UTIMES_NULL)) error = unlocked_access(node, VWRITE, cr); if (error) return (error); } return (0); } int secpolicy_vnode_create_gid(cred_t *cr) { return (EPERM); } int secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) { if (groupmember(gid, cr)) return (0); if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_SETGID)); } int secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr, boolean_t issuidroot __unused) { if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); } void secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr) { if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return; if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) { if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) { vap->va_mask |= AT_MODE; vap->va_mode &= ~(S_ISUID|S_ISGID); } } } int secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, const struct vattr *ovap, cred_t *cr) { int error; if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the process * is not a member of. Both of these are allowed in jail(8). */ if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) { if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE)) return (EFTYPE); } /* * Check for privilege if attempting to set the * group-id bit. */ if ((vap->va_mode & S_ISGID) != 0) { error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid); if (error) return (error); } /* * Deny setting setuid if we are not the file owner. */ if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) { error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN); if (error) return (error); } return (0); } int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp) { return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); } int secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner) { if (owner == cr->cr_uid) return (0); if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); /* XXX: vfs_suser()? */ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER)); } int secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner) { if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN)); } void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp) { if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) { MNT_ILOCK(vfsp); vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER; vfs_clearmntopt(vfsp, MNTOPT_SETUID); vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0); MNT_IUNLOCK(vfsp); } } /* * Check privileges for setting xvattr attributes */ int secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) { if (secpolicy_fs_owner(vp->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS)); } int secpolicy_smb(cred_t *cr) { return (spl_priv_check_cred(cr, PRIV_NETSMB)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c index e8448ce00686..77d33ee2e1f3 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c @@ -1,161 +1,158 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include typedef struct procfs_list_iter { procfs_list_t *pli_pl; void *pli_elt; } pli_t; void seq_printf(struct seq_file *f, const char *fmt, ...) { va_list adx; va_start(adx, fmt); (void) vsnprintf(f->sf_buf, f->sf_size, fmt, adx); va_end(adx); } static int procfs_list_update(kstat_t *ksp, int rw) { procfs_list_t *pl = ksp->ks_private; if (rw == KSTAT_WRITE) pl->pl_clear(pl); return (0); } static int procfs_list_data(char *buf, size_t size, void *data) { pli_t *p; void *elt; procfs_list_t *pl; struct seq_file f; p = data; pl = p->pli_pl; elt = p->pli_elt; free(p, M_TEMP); f.sf_buf = buf; f.sf_size = size; return (pl->pl_show(&f, elt)); } static void * procfs_list_addr(kstat_t *ksp, loff_t n) { procfs_list_t *pl = ksp->ks_private; void *elt = ksp->ks_private1; pli_t *p = NULL; if (n == 0) ksp->ks_private1 = list_head(&pl->pl_list); else if (elt) ksp->ks_private1 = list_next(&pl->pl_list, elt); if (ksp->ks_private1) { p = malloc(sizeof (*p), M_TEMP, M_WAITOK); p->pli_pl = pl; p->pli_elt = ksp->ks_private1; } return (p); } void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off) { kstat_t *procfs_kstat; mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); procfs_list->pl_show = show; procfs_list->pl_show_header = show_header; procfs_list->pl_clear = clear; procfs_list->pl_next_id = 1; procfs_list->pl_node_offset = procfs_list_node_off; procfs_kstat = kstat_create(module, 0, name, submodule, KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (procfs_kstat) { procfs_kstat->ks_lock = &procfs_list->pl_lock; procfs_kstat->ks_ndata = UINT32_MAX; procfs_kstat->ks_private = procfs_list; procfs_kstat->ks_update = procfs_list_update; kstat_set_seq_raw_ops(procfs_kstat, show_header, procfs_list_data, procfs_list_addr); kstat_install(procfs_kstat); procfs_list->pl_private = procfs_kstat; } } void procfs_list_uninstall(procfs_list_t 
*procfs_list) {} void procfs_list_destroy(procfs_list_t *procfs_list) { ASSERT(list_is_empty(&procfs_list->pl_list)); kstat_delete(procfs_list->pl_private); list_destroy(&procfs_list->pl_list); mutex_destroy(&procfs_list->pl_lock); } #define NODE_ID(procfs_list, obj) \ (((procfs_list_node_t *)(((char *)obj) + \ (procfs_list)->pl_node_offset))->pln_id) void procfs_list_add(procfs_list_t *procfs_list, void *p) { ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; list_insert_tail(&procfs_list->pl_list, p); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c index 2a3c027c9389..4c97c9f12caf 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c @@ -1,62 +1,59 @@ /* * Copyright (c) 2010 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include int ddi_strtol(const char *str, char **nptr, int base, long *result) { *result = strtol(str, nptr, base); return (0); } int ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) { *result = (unsigned long long)strtouq(str, nptr, base); return (0); } int ddi_strtoll(const char *str, char **nptr, int base, long long *result) { *result = (long long)strtoq(str, nptr, base); return (0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c index 4a2d02350f62..dc5ed81057b8 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c @@ -1,272 +1,269 @@ /* * Copyright (c) 2010 Pawel Jakub Dawidek * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int log_sysevent(nvlist_t *event) { struct sbuf *sb; const char *type; char typestr[128]; nvpair_t *elem = NULL; sb = sbuf_new_auto(); if (sb == NULL) return (ENOMEM); type = NULL; while ((elem = nvlist_next_nvpair(event, elem)) != NULL) { switch (nvpair_type(elem)) { case DATA_TYPE_BOOLEAN: { boolean_t value; (void) nvpair_value_boolean_value(elem, &value); sbuf_printf(sb, " %s=%s", nvpair_name(elem), value ? "true" : "false"); break; } case DATA_TYPE_UINT8: { uint8_t value; (void) nvpair_value_uint8(elem, &value); sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); break; } case DATA_TYPE_INT32: { int32_t value; (void) nvpair_value_int32(elem, &value); sbuf_printf(sb, " %s=%jd", nvpair_name(elem), (intmax_t)value); break; } case DATA_TYPE_UINT32: { uint32_t value; (void) nvpair_value_uint32(elem, &value); sbuf_printf(sb, " %s=%ju", nvpair_name(elem), (uintmax_t)value); break; } case DATA_TYPE_INT64: { int64_t value; (void) nvpair_value_int64(elem, &value); sbuf_printf(sb, " %s=%jd", nvpair_name(elem), (intmax_t)value); break; } case DATA_TYPE_UINT64: { uint64_t value; (void) nvpair_value_uint64(elem, &value); sbuf_printf(sb, " %s=%ju", nvpair_name(elem), (uintmax_t)value); break; } case DATA_TYPE_STRING: { const char *value; (void) nvpair_value_string(elem, &value); sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) type = value; break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *value; uint_t ii, nelem; (void) nvpair_value_uint8_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%02hhx", value[ii]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *value; uint_t ii, nelem; (void) nvpair_value_uint16_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%04hx", value[ii]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *value; uint_t ii, nelem; (void) nvpair_value_uint32_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *value; uint_t ii, nelem; (void) nvpair_value_int64_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%016lld", (long long)value[ii]); break; 
} case DATA_TYPE_UINT64_ARRAY: { uint64_t *value; uint_t ii, nelem; (void) nvpair_value_uint64_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); break; } case DATA_TYPE_STRING_ARRAY: { const char **strarr; uint_t ii, nelem; (void) nvpair_value_string_array(elem, &strarr, &nelem); for (ii = 0; ii < nelem; ii++) { if (strarr[ii] == NULL) { sbuf_printf(sb, " "); continue; } sbuf_printf(sb, " %s", strarr[ii]); if (strcmp(FM_CLASS, strarr[ii]) == 0) type = strarr[ii]; } break; } case DATA_TYPE_NVLIST: /* XXX - requires recursing in log_sysevent */ break; default: printf("%s: type %d is not implemented\n", __func__, nvpair_type(elem)); break; } } if (sbuf_finish(sb) != 0) { sbuf_delete(sb); return (ENOMEM); } if (type == NULL) type = ""; if (strncmp(type, "ESC_ZFS_", 8) == 0) { snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8); type = typestr; } devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); sbuf_delete(sb); return (0); } static void sysevent_worker(void *arg __unused) { zfs_zevent_t *ze; nvlist_t *event; uint64_t dropped = 0; uint64_t dst_size; int error; zfs_zevent_init(&ze); for (;;) { dst_size = 131072; dropped = 0; event = NULL; error = zfs_zevent_next(ze, &event, &dst_size, &dropped); if (error) { error = zfs_zevent_wait(ze); if (error == ESHUTDOWN) break; } else { VERIFY3P(event, !=, NULL); log_sysevent(event); nvlist_free(event); } } /* * We avoid zfs_zevent_destroy() here because we're otherwise racing * against fm_fini() destroying the zevent_lock. zfs_zevent_destroy() * will currently only clear `ze->ze_zevent` from an event list then * free `ze`, so just inline the free() here -- events have already * been drained. */ VERIFY3P(ze->ze_zevent, ==, NULL); kmem_free(ze, sizeof (zfs_zevent_t)); kthread_exit(); } void ddi_sysevent_init(void) { kproc_kthread_add(sysevent_worker, NULL, &system_proc, NULL, 0, 0, "zfskern", "sysevent"); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c index a005debaf6e3..81d49b7c8725 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c @@ -1,535 +1,532 @@ /* * Copyright (c) 2009 Pawel Jakub Dawidek * All rights reserved. * * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) #include #endif #include #if __FreeBSD_version < 1201522 #define taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \ taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__) #endif static uint_t taskq_tsd; static uma_zone_t taskq_zone; /* * Global system-wide dynamic task queue available for all consumers. This * taskq is not intended for long-running tasks; instead, a dedicated taskq * should be created. */ taskq_t *system_taskq = NULL; taskq_t *system_delay_taskq = NULL; taskq_t *dynamic_taskq = NULL; proc_t *system_proc; static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; static unsigned long tqenthash; static unsigned long tqenthashlock; static struct sx *tqenthashtbl_lock; static taskqid_t tqidnext; #define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) #define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) #define NORMAL_TASK 0 #define TIMEOUT_TASK 1 static void system_taskq_init(void *arg) { int i; tsd_create(&taskq_tsd, NULL); tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash); tqenthashlock = (tqenthash + 1) / 8; if (tqenthashlock > 0) tqenthashlock--; tqenthashtbl_lock = malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1), M_TASKQ, M_WAITOK | M_ZERO); for (i = 0; i < tqenthashlock + 1; i++) sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK); taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri, 0, 0, 0); system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus, minclsyspri, 0, 0, 0); } SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, NULL); static void system_taskq_fini(void *arg) { int i; taskq_destroy(system_delay_taskq); taskq_destroy(system_taskq); uma_zdestroy(taskq_zone); tsd_destroy(&taskq_tsd); for (i = 0; i < tqenthashlock + 1; i++) sx_destroy(&tqenthashtbl_lock[i]); for (i = 0; i < tqenthash + 1; i++) VERIFY(LIST_EMPTY(&tqenthashtbl[i])); free(tqenthashtbl_lock, M_TASKQ); free(tqenthashtbl, M_TASKQ); } SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, NULL); #ifdef __LP64__ static taskqid_t __taskq_genid(void) { taskqid_t tqid; /* * Assume a 64-bit counter will not wrap in practice. 
*/ tqid = atomic_add_64_nv(&tqidnext, 1); VERIFY(tqid); return (tqid); } #else static taskqid_t __taskq_genid(void) { taskqid_t tqid; for (;;) { tqid = atomic_add_32_nv(&tqidnext, 1); if (__predict_true(tqid != 0)) break; } VERIFY(tqid); return (tqid); } #endif static taskq_ent_t * taskq_lookup(taskqid_t tqid) { taskq_ent_t *ent = NULL; if (tqid == 0) return (NULL); sx_slock(TQIDHASHLOCK(tqid)); LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { if (ent->tqent_id == tqid) break; } if (ent != NULL) refcount_acquire(&ent->tqent_rc); sx_sunlock(TQIDHASHLOCK(tqid)); return (ent); } static taskqid_t taskq_insert(taskq_ent_t *ent) { taskqid_t tqid = __taskq_genid(); ent->tqent_id = tqid; sx_xlock(TQIDHASHLOCK(tqid)); LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); sx_xunlock(TQIDHASHLOCK(tqid)); return (tqid); } static void taskq_remove(taskq_ent_t *ent) { taskqid_t tqid = ent->tqent_id; if (tqid == 0) return; sx_xlock(TQIDHASHLOCK(tqid)); if (ent->tqent_id != 0) { LIST_REMOVE(ent, tqent_hash); ent->tqent_id = 0; } sx_xunlock(TQIDHASHLOCK(tqid)); } static void taskq_tsd_set(void *context) { taskq_t *tq = context; #if defined(__amd64__) || defined(__aarch64__) if (context != NULL && tsd_get(taskq_tsd) == NULL) fpu_kern_thread(FPU_KERN_NORMAL); #endif tsd_set(taskq_tsd, tq); } static taskq_t * taskq_create_impl(const char *name, int nthreads, pri_t pri, proc_t *proc __maybe_unused, uint_t flags) { taskq_t *tq; if ((flags & TASKQ_THREADS_CPU_PCT) != 0) nthreads = MAX((mp_ncpus * nthreads) / 100, 1); tq = kmem_alloc(sizeof (*tq), KM_SLEEP); tq->tq_nthreads = nthreads; tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, taskq_tsd_set, tq); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, taskq_tsd_set, NULL); (void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri, proc, "%s", name); return ((taskq_t *)tq); } taskq_t * taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, int maxalloc __unused, uint_t flags) { return (taskq_create_impl(name, nthreads, pri, system_proc, flags)); } taskq_t * taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags) { return (taskq_create_impl(name, nthreads, pri, proc, flags)); } void taskq_destroy(taskq_t *tq) { taskqueue_free(tq->tq_queue); kmem_free(tq, sizeof (*tq)); } static void taskq_sync_assign(void *arg); typedef struct taskq_sync_arg { kthread_t *tqa_thread; kcondvar_t tqa_cv; kmutex_t tqa_lock; int tqa_ready; } taskq_sync_arg_t; static void taskq_sync_assign(void *arg) { taskq_sync_arg_t *tqa = arg; mutex_enter(&tqa->tqa_lock); tqa->tqa_thread = curthread; tqa->tqa_ready = 1; cv_signal(&tqa->tqa_cv); while (tqa->tqa_ready == 1) cv_wait(&tqa->tqa_cv, &tqa->tqa_lock); mutex_exit(&tqa->tqa_lock); } /* * Create a taskq with a specified number of pool threads. Allocate * and return an array of nthreads kthread_t pointers, one for each * thread in the pool. The array is not ordered and must be freed * by the caller. 
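The rendezvous this comment describes (dispatch one blocking task per pool thread, capture each thread's identity, then release them all) can be sketched in portable userland C with pthreads. Everything below, including sync_arg_t, sync_assign() and the thread count, is invented for the sketch and is not part of the kernel interface; build with -lpthread.

#include <pthread.h>
#include <stdio.h>

typedef struct sync_arg {
	pthread_t	sa_thread;	/* captured thread identity */
	pthread_mutex_t	sa_lock;
	pthread_cond_t	sa_cv;
	int		sa_ready;	/* 0 = new, 1 = captured, 2 = released */
} sync_arg_t;

static void *
sync_assign(void *arg)
{
	sync_arg_t *sa = arg;

	pthread_mutex_lock(&sa->sa_lock);
	sa->sa_thread = pthread_self();
	sa->sa_ready = 1;
	pthread_cond_signal(&sa->sa_cv);
	while (sa->sa_ready == 1)	/* park until the creator releases us */
		pthread_cond_wait(&sa->sa_cv, &sa->sa_lock);
	pthread_mutex_unlock(&sa->sa_lock);
	return (NULL);
}

int
main(void)
{
	enum { NTHREADS = 4 };
	sync_arg_t sa[NTHREADS];
	pthread_t workers[NTHREADS];

	for (int i = 0; i < NTHREADS; i++) {
		sa[i].sa_ready = 0;
		pthread_mutex_init(&sa[i].sa_lock, NULL);
		pthread_cond_init(&sa[i].sa_cv, NULL);
		pthread_create(&workers[i], NULL, sync_assign, &sa[i]);
	}
	/* Wait until every worker has recorded its identity. */
	for (int i = 0; i < NTHREADS; i++) {
		pthread_mutex_lock(&sa[i].sa_lock);
		while (sa[i].sa_ready == 0)
			pthread_cond_wait(&sa[i].sa_cv, &sa[i].sa_lock);
		pthread_mutex_unlock(&sa[i].sa_lock);
	}
	/* Release the workers; the captured identities outlive the rendezvous. */
	for (int i = 0; i < NTHREADS; i++) {
		pthread_mutex_lock(&sa[i].sa_lock);
		sa[i].sa_ready = 2;
		pthread_cond_broadcast(&sa[i].sa_cv);
		pthread_mutex_unlock(&sa[i].sa_lock);
		pthread_join(workers[i], NULL);
		printf("worker %d captured itself: %d\n", i,
		    pthread_equal(workers[i], sa[i].sa_thread));
	}
	return (0);
}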
*/ taskq_t * taskq_create_synced(const char *name, int nthreads, pri_t pri, int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) { taskq_t *tq; taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP); kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, KM_SLEEP); flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, flags | TASKQ_PREPOPULATE); VERIFY(tq != NULL); VERIFY(tq->tq_nthreads == nthreads); /* spawn all syncthreads */ for (int i = 0; i < nthreads; i++) { cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL); (void) taskq_dispatch(tq, taskq_sync_assign, &tqs[i], TQ_FRONT); } /* wait on all syncthreads to start */ for (int i = 0; i < nthreads; i++) { mutex_enter(&tqs[i].tqa_lock); while (tqs[i].tqa_ready == 0) cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock); mutex_exit(&tqs[i].tqa_lock); } /* let all syncthreads resume, finish */ for (int i = 0; i < nthreads; i++) { mutex_enter(&tqs[i].tqa_lock); tqs[i].tqa_ready = 2; cv_broadcast(&tqs[i].tqa_cv); mutex_exit(&tqs[i].tqa_lock); } taskq_wait(tq); for (int i = 0; i < nthreads; i++) { kthreads[i] = tqs[i].tqa_thread; mutex_destroy(&tqs[i].tqa_lock); cv_destroy(&tqs[i].tqa_cv); } kmem_free(tqs, sizeof (*tqs) * nthreads); *ktpp = kthreads; return (tq); } int taskq_member(taskq_t *tq, kthread_t *thread) { return (taskqueue_member(tq->tq_queue, thread)); } taskq_t * taskq_of_curthread(void) { return (tsd_get(taskq_tsd)); } static void taskq_free(taskq_ent_t *task) { taskq_remove(task); if (refcount_release(&task->tqent_rc)) uma_zfree(taskq_zone, task); } int taskq_cancel_id(taskq_t *tq, taskqid_t tid) { uint32_t pend; int rc; taskq_ent_t *ent; if ((ent = taskq_lookup(tid)) == NULL) return (0); if (ent->tqent_type == NORMAL_TASK) { rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); if (rc == EBUSY) taskqueue_drain(tq->tq_queue, &ent->tqent_task); } else { rc = taskqueue_cancel_timeout(tq->tq_queue, &ent->tqent_timeout_task, &pend); if (rc == EBUSY) { taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); } } if (pend) { /* * Tasks normally free themselves when run, but here the task * was cancelled so it did not free itself. */ taskq_free(ent); } /* Free the extra reference we added with taskq_lookup. 
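For context on why taskq_cancel_id() ends up calling taskq_free() twice, here is a hedged userland sketch of the reference life cycle: one reference belongs to the pending task itself and one to the lookup, and a cancelled task must shed both. The ent_* names are invented for the sketch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct ent {
	atomic_int	e_refs;
} ent_t;

static ent_t *
ent_create(void)
{
	ent_t *e = malloc(sizeof (*e));

	atomic_init(&e->e_refs, 1);	/* reference held by the pending task */
	return (e);
}

static void
ent_hold(ent_t *e)
{
	atomic_fetch_add(&e->e_refs, 1);	/* what the lookup does */
}

static void
ent_rele(ent_t *e)
{
	if (atomic_fetch_sub(&e->e_refs, 1) == 1) {
		printf("freed\n");	/* last reference dropped */
		free(e);
	}
}

int
main(void)
{
	ent_t *e = ent_create();

	ent_hold(e);	/* lookup takes an extra reference */
	ent_rele(e);	/* cancelled while pending: drop the task's reference */
	ent_rele(e);	/* drop the lookup reference -> "freed" */
	return (0);
}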
*/ taskq_free(ent); return (rc); } static void taskq_run(void *arg, int pending) { taskq_ent_t *task = arg; if (pending == 0) return; task->tqent_func(task->tqent_arg); taskq_free(task); } taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t flags, clock_t expire_time) { taskq_ent_t *task; taskqid_t tqid; clock_t timo; int mflag; timo = expire_time - ddi_get_lbolt(); if (timo <= 0) return (taskq_dispatch(tq, func, arg, flags)); if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) mflag = M_WAITOK; else mflag = M_NOWAIT; task = uma_zalloc(taskq_zone, mflag); if (task == NULL) return (0); task->tqent_func = func; task->tqent_arg = arg; task->tqent_type = TIMEOUT_TASK; refcount_init(&task->tqent_rc, 1); tqid = taskq_insert(task); TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, taskq_run, task); taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task, timo); return (tqid); } taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) { taskq_ent_t *task; int mflag, prio; taskqid_t tqid; if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) mflag = M_WAITOK; else mflag = M_NOWAIT; /* * If TQ_FRONT is given, we want higher priority for this task, so it * can go at the front of the queue. */ prio = !!(flags & TQ_FRONT); task = uma_zalloc(taskq_zone, mflag); if (task == NULL) return (0); refcount_init(&task->tqent_rc, 1); task->tqent_func = func; task->tqent_arg = arg; task->tqent_type = NORMAL_TASK; tqid = taskq_insert(task); TASK_INIT(&task->tqent_task, prio, taskq_run, task); taskqueue_enqueue(tq->tq_queue, &task->tqent_task); return (tqid); } static void taskq_run_ent(void *arg, int pending) { taskq_ent_t *task = arg; if (pending == 0) return; task->tqent_func(task->tqent_arg); } void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, taskq_ent_t *task) { /* * If TQ_FRONT is given, we want higher priority for this task, so it * can go at the front of the queue. */ task->tqent_task.ta_priority = !!(flags & TQ_FRONT); task->tqent_func = func; task->tqent_arg = arg; taskqueue_enqueue(tq->tq_queue, &task->tqent_task); } void taskq_init_ent(taskq_ent_t *task) { TASK_INIT(&task->tqent_task, 0, taskq_run_ent, task); task->tqent_func = NULL; task->tqent_arg = NULL; task->tqent_id = 0; task->tqent_type = NORMAL_TASK; task->tqent_rc = 0; } int taskq_empty_ent(taskq_ent_t *task) { return (task->tqent_task.ta_pending == 0); } void taskq_wait(taskq_t *tq) { taskqueue_quiesce(tq->tq_queue); } void taskq_wait_id(taskq_t *tq, taskqid_t tid) { taskq_ent_t *ent; if ((ent = taskq_lookup(tid)) == NULL) return; if (ent->tqent_type == NORMAL_TASK) taskqueue_drain(tq->tq_queue, &ent->tqent_task); else taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); taskq_free(ent); } void taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) { taskqueue_drain_all(tq->tq_queue); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c index a07098afc5b4..d67bd7178735 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c @@ -1,279 +1,276 @@ /* * Copyright (c) 2006-2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DECLARE(M_MOUNT); void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, int flags __unused) { struct vfsopt *opt; size_t namesize; int locked; if (!(locked = mtx_owned(MNT_MTX(vfsp)))) MNT_ILOCK(vfsp); if (vfsp->mnt_opt == NULL) { void *opts; MNT_IUNLOCK(vfsp); opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK); MNT_ILOCK(vfsp); if (vfsp->mnt_opt == NULL) { vfsp->mnt_opt = opts; TAILQ_INIT(vfsp->mnt_opt); } else { free(opts, M_MOUNT); } } MNT_IUNLOCK(vfsp); opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK); namesize = strlen(name) + 1; opt->name = malloc(namesize, M_MOUNT, M_WAITOK); strlcpy(opt->name, name, namesize); opt->pos = -1; opt->seen = 1; if (arg == NULL) { opt->value = NULL; opt->len = 0; } else { opt->len = strlen(arg) + 1; opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); memcpy(opt->value, arg, opt->len); } MNT_ILOCK(vfsp); TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); if (!locked) MNT_IUNLOCK(vfsp); } void vfs_clearmntopt(vfs_t *vfsp, const char *name) { int locked; if (!(locked = mtx_owned(MNT_MTX(vfsp)))) MNT_ILOCK(vfsp); vfs_deleteopt(vfsp->mnt_opt, name); if (!locked) MNT_IUNLOCK(vfsp); } int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) { struct vfsoptlist *opts = vfsp->mnt_optnew; int error; if (opts == NULL) return (0); error = vfs_getopt(opts, opt, (void **)argp, NULL); return (error != 0 ? 0 : 1); } int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, char *fspec, int fsflags) { struct vfsconf *vfsp; struct mount *mp; vnode_t *vp, *mvp; int error; ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); vp = *vpp; *vpp = NULL; error = 0; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) error = ENAMETOOLONG; if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) error = ENODEV; if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; /* * We need vnode lock to protect v_mountedhere and vnode interlock * to protect v_iflag. */ if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } vn_seqc_write_begin(vp); VOP_UNLOCK1(vp); /* * Allocate and initialize the filesystem. 
* We don't want regular user that triggered snapshot mount to be able * to unmount it, so pass credentials of the parent mount. */ mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred); mp->mnt_optnew = NULL; vfs_setmntopt(mp, "from", fspec, 0); mp->mnt_optnew = mp->mnt_opt; mp->mnt_opt = NULL; /* * Set the mount level flags. */ mp->mnt_flag = fsflags & MNT_UPDATEMASK; /* * Snapshots are always read-only. */ mp->mnt_flag |= MNT_RDONLY; /* * We don't want snapshots to allow access to vulnerable setuid * programs, so we turn off setuid when mounting snapshots. */ mp->mnt_flag |= MNT_NOSUID; /* * We don't want snapshots to be visible in regular * mount(8) and df(1) output. */ mp->mnt_flag |= MNT_IGNORE; error = VFS_MOUNT(mp); if (error != 0) { /* * Clear VI_MOUNT and decrement the use count "atomically", * under the vnode lock. This is not strictly required, * but makes it easier to reason about the life-cycle and * ownership of the covered vnode. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vn_seqc_write_end(vp); vput(vp); vfs_unbusy(mp); vfs_freeopts(mp->mnt_optnew); mp->mnt_vnodecovered = NULL; vfs_mount_destroy(mp); return (error); } if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; (void) VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef FREEBSD_NAMECACHE cache_purge(vp); #endif VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; #ifdef VIRF_MOUNTPOINT vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); #endif vp->v_mountedhere = mp; VI_UNLOCK(vp); /* Put the new filesystem on the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) panic("mount: lost mount"); vn_seqc_write_end(vp); VOP_UNLOCK1(vp); #if __FreeBSD_version >= 1300048 vfs_op_exit(mp); #endif vfs_unbusy(mp); *vpp = mvp; return (0); } /* * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it * asynchronously using a taskq. This can avoid deadlocks caused by re-entering * the file system as a result of releasing the vnode. Note, file systems * already have to handle the race where the vnode is incremented before the * inactive routine is called and does its locking. * * Warning: Excessive use of this routine can lead to performance problems. * This is because taskqs throttle back allocation if too many are created. */ void vn_rele_async(vnode_t *vp, taskq_t *taskq) { VERIFY3U(vp->v_usecount, >, 0); if (refcount_release_if_not_last(&vp->v_usecount)) { #if __FreeBSD_version < 1300045 vdrop(vp); #endif return; } VERIFY3U(taskq_dispatch((taskq_t *)taskq, (task_func_t *)vrele, vp, TQ_SLEEP), !=, 0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c index 739ddb05e895..e6f019cb9a46 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c @@ -1,75 +1,72 @@ /* * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include const int zfs_vm_pagerret_bad = VM_PAGER_BAD; const int zfs_vm_pagerret_error = VM_PAGER_ERROR; const int zfs_vm_pagerret_ok = VM_PAGER_OK; const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL; void zfs_vmobject_assert_wlocked(vm_object_t object) { /* * This is not ideal because FILE/LINE used by assertions will not * be too helpful, but it must be an hard function for * compatibility reasons. */ VM_OBJECT_ASSERT_WLOCKED(object); } void zfs_vmobject_wlock(vm_object_t object) { VM_OBJECT_WLOCK(object); } void zfs_vmobject_wunlock(vm_object_t object) { VM_OBJECT_WUNLOCK(object); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c index 8bd3bdedf268..6cfea889a272 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c @@ -1,237 +1,234 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #if __FreeBSD_version >= 1300041 #include #else #include #endif #include static void * zcalloc(void *opaque, uint_t items, uint_t size) { (void) opaque; return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT)); } static void zcfree(void *opaque, void *ptr) { (void) opaque; free(ptr, M_SOLARIS); } static int zlib_deflateInit(z_stream *stream, int level) { stream->zalloc = zcalloc; stream->opaque = NULL; stream->zfree = zcfree; return (deflateInit(stream, level)); } static int zlib_deflate(z_stream *stream, int flush) { return (deflate(stream, flush)); } static int zlib_deflateEnd(z_stream *stream) { return (deflateEnd(stream)); } static int zlib_inflateInit(z_stream *stream) { stream->zalloc = zcalloc; stream->opaque = NULL; stream->zfree = zcfree; return (inflateInit(stream)); } static int zlib_inflate(z_stream *stream, int finish) { #if __FreeBSD_version >= 1300024 return (inflate(stream, finish)); #else return (_zlib104_inflate(stream, finish)); #endif } static int zlib_inflateEnd(z_stream *stream) { return (inflateEnd(stream)); } /* * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc * and vfree for every call. Using a kmem_cache also has the advantage * that improves the odds that the memory used will be local to this cpu. * To further improve things it might be wise to create a dedicated per-cpu * workspace for use. This would take some additional care because we then * must disable preemption around the critical section, and verify that * zlib_deflate* and zlib_inflate* never internally call schedule(). */ static void * zlib_workspace_alloc(int flags) { // return (kmem_cache_alloc(zlib_workspace_cache, flags)); return (NULL); } static void zlib_workspace_free(void *workspace) { // kmem_cache_free(zlib_workspace_cache, workspace); } /* * Compresses the source buffer into the destination buffer. The level * parameter has the same meaning as in deflateInit. sourceLen is the byte * length of the source buffer. Upon entry, destLen is the total size of the * destination buffer, which must be at least 0.1% larger than sourceLen plus * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. * * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough * memory, Z_BUF_ERROR if there was not enough room in the output buffer, * Z_STREAM_ERROR if the level parameter is invalid. */ int z_compress_level(void *dest, size_t *destLen, const void *source, size_t sourceLen, int level) { z_stream stream = {0}; int err; stream.next_in = (Byte *)source; stream.avail_in = (uInt)sourceLen; stream.next_out = dest; stream.avail_out = (uInt)*destLen; stream.opaque = NULL; if ((size_t)stream.avail_out != *destLen) return (Z_BUF_ERROR); stream.opaque = zlib_workspace_alloc(KM_SLEEP); #if 0 if (!stream.opaque) return (Z_MEM_ERROR); #endif err = zlib_deflateInit(&stream, level); if (err != Z_OK) { zlib_workspace_free(stream.opaque); return (err); } err = zlib_deflate(&stream, Z_FINISH); if (err != Z_STREAM_END) { zlib_deflateEnd(&stream); zlib_workspace_free(stream.opaque); return (err == Z_OK ? Z_BUF_ERROR : err); } *destLen = stream.total_out; err = zlib_deflateEnd(&stream); zlib_workspace_free(stream.opaque); return (err); } /* * Decompresses the source buffer into the destination buffer. sourceLen is * the byte length of the source buffer. Upon entry, destLen is the total * size of the destination buffer, which must be large enough to hold the * entire uncompressed data. 
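The destLen contract documented above (capacity on entry, produced size on exit) is the same one exposed by stock zlib in userland. A small round-trip against the regular zlib library (compile with -lz); this exercises the public compress2()/uncompress() API, not the kernel wrappers in this file, and the sample input is made up:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int
main(void)
{
	const char src[] = "zfs zfs zfs zfs zfs zfs zfs zfs zfs zfs";
	Bytef comp[256], out[256];
	uLongf complen = sizeof (comp);	/* in: capacity, out: compressed size */
	uLongf outlen = sizeof (out);	/* in: capacity, out: uncompressed size */

	if (compress2(comp, &complen, (const Bytef *)src, sizeof (src),
	    Z_BEST_SPEED) != Z_OK)
		return (1);
	if (uncompress(out, &outlen, comp, complen) != Z_OK)
		return (1);

	/* outlen now equals sizeof (src) and the data round-trips. */
	printf("%zu -> %lu -> %lu bytes\n", sizeof (src),
	    (unsigned long)complen, (unsigned long)outlen);
	return (memcmp(src, out, sizeof (src)) != 0);
}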
(The size of the uncompressed data must have * been saved previously by the compressor and transmitted to the decompressor * by some mechanism outside the scope of this compression library.) * Upon exit, destLen is the actual size of the compressed buffer. * This function can be used to decompress a whole file at once if the * input file is mmap'ed. * * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not * enough memory, Z_BUF_ERROR if there was not enough room in the output * buffer, or Z_DATA_ERROR if the input data was corrupted. */ int z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) { z_stream stream = {0}; int err; stream.next_in = (Byte *)source; stream.avail_in = (uInt)sourceLen; stream.next_out = dest; stream.avail_out = (uInt)*destLen; if ((size_t)stream.avail_out != *destLen) return (Z_BUF_ERROR); stream.opaque = zlib_workspace_alloc(KM_SLEEP); #if 0 if (!stream.opaque) return (Z_MEM_ERROR); #endif err = zlib_inflateInit(&stream); if (err != Z_OK) { zlib_workspace_free(stream.opaque); return (err); } err = zlib_inflate(&stream, Z_FINISH); if (err != Z_STREAM_END) { zlib_inflateEnd(&stream); zlib_workspace_free(stream.opaque); if (err == Z_NEED_DICT || (err == Z_BUF_ERROR && stream.avail_in == 0)) return (Z_DATA_ERROR); return (err); } *destLen = stream.total_out; err = zlib_inflateEnd(&stream); zlib_workspace_free(stream.opaque); return (err); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c index 658ef0bf056d..7f2b5c712c42 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c @@ -1,260 +1,257 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data"); /* * Structure to record list of ZFS datasets exported to a zone. 
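The structure defined just below ends in a zero-length zd_dataset[0] array, so zone_dataset_attach() sizes a single allocation as sizeof (*zd) + strlen(dataset) + 1 and copies the name into the tail. A standalone sketch of that allocation pattern, using a standard C99 flexible array member and an invented ds_entry_t type with an example dataset name:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Same layout idea as zone_dataset_t: a header plus inline name storage. */
typedef struct ds_entry {
	struct ds_entry	*de_next;
	char		de_name[];	/* flexible array member */
} ds_entry_t;

static ds_entry_t *
ds_entry_alloc(const char *name)
{
	/* One allocation covers the struct and the NUL-terminated name. */
	ds_entry_t *de = malloc(sizeof (*de) + strlen(name) + 1);

	if (de != NULL) {
		de->de_next = NULL;
		strcpy(de->de_name, name);
	}
	return (de);
}

int
main(void)
{
	ds_entry_t *de = ds_entry_alloc("tank/jailed/dataset");

	if (de == NULL)
		return (1);
	printf("%s\n", de->de_name);
	free(de);
	return (0);
}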
*/ typedef struct zone_dataset { LIST_ENTRY(zone_dataset) zd_next; char zd_dataset[0]; } zone_dataset_t; LIST_HEAD(zone_dataset_head, zone_dataset); static int zone_slot; int zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) { struct zone_dataset_head *head; zone_dataset_t *zd, *zd2; struct prison *pr; int dofree, error; if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) return (error); /* Allocate memory before we grab prison's mutex. */ zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK); sx_slock(&allprison_lock); pr = prison_find(jailid); /* Locks &pr->pr_mtx. */ sx_sunlock(&allprison_lock); if (pr == NULL) { free(zd, M_ZONES); return (ENOENT); } head = osd_jail_get(pr, zone_slot); if (head != NULL) { dofree = 0; LIST_FOREACH(zd2, head, zd_next) { if (strcmp(dataset, zd2->zd_dataset) == 0) { free(zd, M_ZONES); error = EEXIST; goto end; } } } else { dofree = 1; prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); head = malloc(sizeof (*head), M_ZONES, M_WAITOK); LIST_INIT(head); mtx_lock(&pr->pr_mtx); error = osd_jail_set(pr, zone_slot, head); KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", error)); } strcpy(zd->zd_dataset, dataset); LIST_INSERT_HEAD(head, zd, zd_next); end: if (dofree) prison_free_locked(pr); else mtx_unlock(&pr->pr_mtx); return (error); } int zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) { struct zone_dataset_head *head; zone_dataset_t *zd; struct prison *pr; int error; if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) return (error); sx_slock(&allprison_lock); pr = prison_find(jailid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENOENT); head = osd_jail_get(pr, zone_slot); if (head == NULL) { error = ENOENT; goto end; } LIST_FOREACH(zd, head, zd_next) { if (strcmp(dataset, zd->zd_dataset) == 0) break; } if (zd == NULL) error = ENOENT; else { LIST_REMOVE(zd, zd_next); free(zd, M_ZONES); if (LIST_EMPTY(head)) osd_jail_del(pr, zone_slot); error = 0; } end: mtx_unlock(&pr->pr_mtx); return (error); } /* * Returns true if the named dataset is visible in the current zone. * The 'write' parameter is set to 1 if the dataset is also writable. */ int zone_dataset_visible(const char *dataset, int *write) { struct zone_dataset_head *head; zone_dataset_t *zd; struct prison *pr; size_t len; int ret = 0; if (dataset[0] == '\0') return (0); if (INGLOBALZONE(curproc)) { if (write != NULL) *write = 1; return (1); } pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); head = osd_jail_get(pr, zone_slot); if (head == NULL) goto end; /* * Walk the list once, looking for datasets which match exactly, or * specify a dataset underneath an exported dataset. If found, return * true and note that it is writable. */ LIST_FOREACH(zd, head, zd_next) { len = strlen(zd->zd_dataset); if (strlen(dataset) >= len && memcmp(dataset, zd->zd_dataset, len) == 0 && (dataset[len] == '\0' || dataset[len] == '/' || dataset[len] == '@')) { if (write) *write = 1; ret = 1; goto end; } } /* * Walk the list a second time, searching for datasets which are parents * of exported datasets. These should be visible, but read-only. * * Note that we also have to support forms such as 'pool/dataset/', with * a trailing slash. 
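Taken together, the two walks implement a prefix test: descendants of an exported dataset are visible and writable, while ancestors are visible but read-only. A userland sketch of the same matching, with a plain array standing in for the per-jail OSD list and made-up dataset names:

#include <stdio.h>
#include <string.h>

static int
dataset_visible(const char *dataset, const char **exported, int nexported,
    int *writable)
{
	size_t len;

	/* Pass 1: exact match or child of an exported dataset => writable. */
	for (int i = 0; i < nexported; i++) {
		len = strlen(exported[i]);
		if (strncmp(dataset, exported[i], len) == 0 &&
		    (dataset[len] == '\0' || dataset[len] == '/' ||
		    dataset[len] == '@')) {
			*writable = 1;
			return (1);
		}
	}

	/* Pass 2: ancestor of an exported dataset => visible, read-only. */
	len = strlen(dataset);
	if (len > 0 && dataset[len - 1] == '/')
		len--;			/* tolerate a trailing slash */
	for (int i = 0; i < nexported; i++) {
		if (len < strlen(exported[i]) &&
		    strncmp(dataset, exported[i], len) == 0 &&
		    exported[i][len] == '/') {
			*writable = 0;
			return (1);
		}
	}
	return (0);
}

int
main(void)
{
	const char *exported[] = { "tank/jail1/data" };
	int w = 0;

	printf("%d\n", dataset_visible("tank/jail1/data/work", exported, 1, &w)); /* 1, w=1 */
	printf("%d\n", dataset_visible("tank/jail1", exported, 1, &w));           /* 1, w=0 */
	printf("%d\n", dataset_visible("tank/other", exported, 1, &w));           /* 0 */
	return (0);
}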
*/ LIST_FOREACH(zd, head, zd_next) { len = strlen(dataset); if (dataset[len - 1] == '/') len--; /* Ignore trailing slash */ if (len < strlen(zd->zd_dataset) && memcmp(dataset, zd->zd_dataset, len) == 0 && zd->zd_dataset[len] == '/') { if (write) *write = 0; ret = 1; goto end; } } end: mtx_unlock(&pr->pr_mtx); return (ret); } static void zone_destroy(void *arg) { struct zone_dataset_head *head; zone_dataset_t *zd; head = arg; while ((zd = LIST_FIRST(head)) != NULL) { LIST_REMOVE(zd, zd_next); free(zd, M_ZONES); } free(head, M_ZONES); } uint32_t zone_get_hostid(void *ptr) { KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__)); return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid); } static void zone_sysinit(void *arg __unused) { zone_slot = osd_jail_register(zone_destroy, NULL); } static void zone_sysuninit(void *arg __unused) { osd_jail_deregister(zone_slot); } SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c index 1f139ea5b807..ed8d2407613e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c @@ -1,635 +1,632 @@ /* * Copyright (c) 2005-2010 Pawel Jakub Dawidek * Copyright (c) 2018 Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c */ -#include -__FBSDID("$FreeBSD$"); - #include #include #ifdef _KERNEL #include #include #include #include #include #endif #include #include #include #include #define SHA512_HMAC_BLOCK_SIZE 128 static int crypt_sessions = 0; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD, &crypt_sessions, 0, "Number of cryptographic sessions created"); void crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key) { uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE], k_opad[SHA512_HMAC_BLOCK_SIZE], key[SHA512_HMAC_BLOCK_SIZE]; SHA512_CTX lctx; int i; size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length); /* * This code is based on the similar code in geom/eli/g_eli_hmac.c */ memset(key, 0, sizeof (key)); if (c_key->ck_length == 0) /* do nothing */; else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE) memcpy(key, c_key->ck_data, cl_bytes); else { /* * If key is longer than 128 bytes reset it to * key = SHA512(key). */ SHA512_Init(&lctx); SHA512_Update(&lctx, c_key->ck_data, cl_bytes); SHA512_Final(key, &lctx); } /* XOR key with ipad and opad values. */ for (i = 0; i < sizeof (key); i++) { k_ipad[i] = key[i] ^ 0x36; k_opad[i] = key[i] ^ 0x5c; } memset(key, 0, sizeof (key)); /* Start inner SHA512. */ SHA512_Init(&ctx->innerctx); SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad)); memset(k_ipad, 0, sizeof (k_ipad)); /* Start outer SHA512. */ SHA512_Init(&ctx->outerctx); SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad)); memset(k_opad, 0, sizeof (k_opad)); } void crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize) { SHA512_Update(&ctx->innerctx, data, datasize); } void crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize) { uint8_t digest[SHA512_DIGEST_LENGTH]; /* Complete inner hash */ SHA512_Final(digest, &ctx->innerctx); /* Complete outer hash */ SHA512_Update(&ctx->outerctx, digest, sizeof (digest)); SHA512_Final(digest, &ctx->outerctx); memset(ctx, 0, sizeof (*ctx)); /* mdsize == 0 means "Give me the whole hash!" */ if (mdsize == 0) mdsize = SHA512_DIGEST_LENGTH; memcpy(md, digest, mdsize); memset(digest, 0, sizeof (digest)); } void crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size, void *out_data, size_t out_data_size) { struct hmac_ctx ctx; crypto_mac_init(&ctx, key); crypto_mac_update(&ctx, in_data, in_data_size); crypto_mac_final(&ctx, out_data, out_data_size); } static int freebsd_zfs_crypt_done(struct cryptop *crp) { freebsd_crypt_session_t *ses; ses = crp->crp_opaque; mtx_lock(&ses->fs_lock); ses->fs_done = true; mtx_unlock(&ses->fs_lock); wakeup(crp); return (0); } static int freebsd_zfs_crypt_done_sync(struct cryptop *crp) { return (0); } void freebsd_crypt_freesession(freebsd_crypt_session_t *sess) { mtx_destroy(&sess->fs_lock); crypto_freesession(sess->fs_sid); memset(sess, 0, sizeof (*sess)); } static int zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp) { int error; crp->crp_opaque = session; for (;;) { #if __FreeBSD_version < 1400004 boolean_t async = ((crypto_ses2caps(crp->crp_session) & CRYPTOCAP_F_SYNC) == 0); #else boolean_t async = !CRYPTO_SESS_SYNC(crp->crp_session); #endif crp->crp_callback = async ? 
freebsd_zfs_crypt_done : freebsd_zfs_crypt_done_sync; error = crypto_dispatch(crp); if (error == 0) { if (async) { mtx_lock(&session->fs_lock); while (session->fs_done == false) { msleep(crp, &session->fs_lock, 0, "zfs_crypto", 0); } mtx_unlock(&session->fs_lock); } error = crp->crp_etype; } if (error == ENOMEM) { pause("zcrnomem", 1); } else if (error != EAGAIN) { break; } crp->crp_etype = 0; crp->crp_flags &= ~CRYPTO_F_DONE; session->fs_done = false; #if __FreeBSD_version < 1300087 /* * Session ID changed, so we should record that, * and try again */ session->fs_sid = crp->crp_session; #endif } return (error); } static void freebsd_crypt_uio_debug_log(boolean_t encrypt, freebsd_crypt_session_t *input_sessionp, const struct zio_crypt_info *c_info, zfs_uio_t *data_uio, crypto_key_t *key, uint8_t *ivbuf, size_t datalen, size_t auth_len) { #ifdef FCRYPTO_DEBUG struct cryptodesc *crd; uint8_t *p = NULL; size_t total = 0; printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %p, %u }, " "%p, %u, %u)\n", __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp, c_info->ci_algname, c_info->ci_crypt_type, (unsigned int)c_info->ci_keylen, c_info->ci_name, data_uio, key->ck_data, (unsigned int)key->ck_length, ivbuf, (unsigned int)datalen, (unsigned int)auth_len); printf("\tkey = { "); for (int i = 0; i < key->ck_length / 8; i++) { uint8_t *b = (uint8_t *)key->ck_data; printf("%02x ", b[i]); } printf("}\n"); for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) { printf("\tiovec #%d: <%p, %u>\n", i, zfs_uio_iovbase(data_uio, i), (unsigned int)zfs_uio_iovlen(data_uio, i)); total += zfs_uio_iovlen(data_uio, i); } zfs_uio_resid(data_uio) = total; #endif } /* * Create a new cryptographic session. This should * happen every time the key changes (including when * it's first loaded). */ #if __FreeBSD_version >= 1300087 int freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, const struct zio_crypt_info *c_info, crypto_key_t *key) { struct crypto_session_params csp = {0}; int error = 0; #ifdef FCRYPTO_DEBUG printf("%s(%p, { %s, %d, %d, %s }, { %p, %u })\n", __FUNCTION__, sessp, c_info->ci_algname, c_info->ci_crypt_type, (unsigned int)c_info->ci_keylen, c_info->ci_name, key->ck_data, (unsigned int)key->ck_length); printf("\tkey = { "); for (int i = 0; i < key->ck_length / 8; i++) { uint8_t *b = (uint8_t *)key->ck_data; printf("%02x ", b[i]); } printf("}\n"); #endif csp.csp_mode = CSP_MODE_AEAD; csp.csp_cipher_key = key->ck_data; csp.csp_cipher_klen = key->ck_length / 8; switch (c_info->ci_crypt_type) { case ZC_TYPE_GCM: csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16; csp.csp_ivlen = AES_GCM_IV_LEN; switch (key->ck_length/8) { case AES_128_GMAC_KEY_LEN: case AES_192_GMAC_KEY_LEN: case AES_256_GMAC_KEY_LEN: break; default: error = EINVAL; goto bad; } break; case ZC_TYPE_CCM: csp.csp_cipher_alg = CRYPTO_AES_CCM_16; csp.csp_ivlen = AES_CCM_IV_LEN; switch (key->ck_length/8) { case AES_128_CBC_MAC_KEY_LEN: case AES_192_CBC_MAC_KEY_LEN: case AES_256_CBC_MAC_KEY_LEN: break; default: error = EINVAL; goto bad; break; } break; default: error = ENOTSUP; goto bad; } /* * Disable the use of hardware drivers on FreeBSD 13 and later since * common crypto offload drivers impose constraints on AES-GCM AAD * lengths that make them unusable for ZFS, and we currently do not have * a mechanism to fall back to a software driver for requests not * handled by a hardware driver. * * On 12 we continue to permit the use of hardware drivers since * CPU-accelerated drivers such as aesni(4) register themselves as * hardware drivers. 
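Looking back at the crypto_mac_*() helpers earlier in this file: they build HMAC-SHA512 by hand, zero-padding the key (or hashing it down) to the 128-byte block size, XORing it with the 0x36/0x5c pads, and maintaining separate inner and outer SHA-512 contexts. The same construction can be sketched in user space against OpenSSL's low-level SHA512_* API; this is an assumption for illustration only (OpenZFS does not use OpenSSL here, and these particular OpenSSL interfaces are deprecated in 3.x):

#include <openssl/sha.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	HMAC_BLOCK	128		/* SHA-512 block size in bytes */

static void
hmac_sha512(const void *key, size_t keylen, const void *msg, size_t msglen,
    uint8_t digest[SHA512_DIGEST_LENGTH])
{
	uint8_t k[HMAC_BLOCK] = { 0 }, ipad[HMAC_BLOCK], opad[HMAC_BLOCK];
	SHA512_CTX inner, outer;

	/* Long keys are replaced by their SHA-512 digest, short keys padded. */
	if (keylen > HMAC_BLOCK) {
		SHA512_CTX t;
		SHA512_Init(&t);
		SHA512_Update(&t, key, keylen);
		SHA512_Final(k, &t);
	} else {
		memcpy(k, key, keylen);
	}

	for (int i = 0; i < HMAC_BLOCK; i++) {
		ipad[i] = k[i] ^ 0x36;
		opad[i] = k[i] ^ 0x5c;
	}

	/* inner = SHA512(ipad || msg) */
	SHA512_Init(&inner);
	SHA512_Update(&inner, ipad, sizeof (ipad));
	SHA512_Update(&inner, msg, msglen);
	SHA512_Final(digest, &inner);

	/* outer = SHA512(opad || inner digest) */
	SHA512_Init(&outer);
	SHA512_Update(&outer, opad, sizeof (opad));
	SHA512_Update(&outer, digest, SHA512_DIGEST_LENGTH);
	SHA512_Final(digest, &outer);
}

int
main(void)
{
	uint8_t md[SHA512_DIGEST_LENGTH];

	hmac_sha512("key", 3, "message", 7, md);
	for (int i = 0; i < SHA512_DIGEST_LENGTH; i++)
		printf("%02x", md[i]);
	printf("\n");
	return (0);
}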
*/ error = crypto_newsession(&sessp->fs_sid, &csp, CRYPTOCAP_F_SOFTWARE); mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", NULL, MTX_DEF); crypt_sessions++; bad: #ifdef FCRYPTO_DEBUG if (error) printf("%s: returning error %d\n", __FUNCTION__, error); #endif return (error); } int freebsd_crypt_uio(boolean_t encrypt, freebsd_crypt_session_t *input_sessionp, const struct zio_crypt_info *c_info, zfs_uio_t *data_uio, crypto_key_t *key, uint8_t *ivbuf, size_t datalen, size_t auth_len) { struct cryptop *crp; freebsd_crypt_session_t *session = NULL; int error = 0; size_t total = 0; freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, key, ivbuf, datalen, auth_len); for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) total += zfs_uio_iovlen(data_uio, i); zfs_uio_resid(data_uio) = total; if (input_sessionp == NULL) { session = kmem_zalloc(sizeof (*session), KM_SLEEP); error = freebsd_crypt_newsession(session, c_info, key); if (error) goto out; } else session = input_sessionp; crp = crypto_getreq(session->fs_sid, M_WAITOK); if (encrypt) { crp->crp_op = CRYPTO_OP_ENCRYPT | CRYPTO_OP_COMPUTE_DIGEST; } else { crp->crp_op = CRYPTO_OP_DECRYPT | CRYPTO_OP_VERIFY_DIGEST; } crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE; crypto_use_uio(crp, GET_UIO_STRUCT(data_uio)); crp->crp_aad_start = 0; crp->crp_aad_length = auth_len; crp->crp_payload_start = auth_len; crp->crp_payload_length = datalen; crp->crp_digest_start = auth_len + datalen; memcpy(crp->crp_iv, ivbuf, ZIO_DATA_IV_LEN); error = zfs_crypto_dispatch(session, crp); crypto_freereq(crp); out: #ifdef FCRYPTO_DEBUG if (error) printf("%s: returning error %d\n", __FUNCTION__, error); #endif if (input_sessionp == NULL) { freebsd_crypt_freesession(session); kmem_free(session, sizeof (*session)); } return (error); } #else int freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, const struct zio_crypt_info *c_info, crypto_key_t *key) { struct cryptoini cria = {0}, crie = {0}, *crip; struct enc_xform *xform; struct auth_hash *xauth; int error = 0; crypto_session_t sid; #ifdef FCRYPTO_DEBUG printf("%s(%p, { %s, %d, %d, %s }, { %p, %u })\n", __FUNCTION__, sessp, c_info->ci_algname, c_info->ci_crypt_type, (unsigned int)c_info->ci_keylen, c_info->ci_name, key->ck_data, (unsigned int)key->ck_length); printf("\tkey = { "); for (int i = 0; i < key->ck_length / 8; i++) { uint8_t *b = (uint8_t *)key->ck_data; printf("%02x ", b[i]); } printf("}\n"); #endif switch (c_info->ci_crypt_type) { case ZC_TYPE_GCM: xform = &enc_xform_aes_nist_gcm; switch (key->ck_length/8) { case AES_128_GMAC_KEY_LEN: xauth = &auth_hash_nist_gmac_aes_128; break; case AES_192_GMAC_KEY_LEN: xauth = &auth_hash_nist_gmac_aes_192; break; case AES_256_GMAC_KEY_LEN: xauth = &auth_hash_nist_gmac_aes_256; break; default: error = EINVAL; goto bad; } break; case ZC_TYPE_CCM: xform = &enc_xform_ccm; switch (key->ck_length/8) { case AES_128_CBC_MAC_KEY_LEN: xauth = &auth_hash_ccm_cbc_mac_128; break; case AES_192_CBC_MAC_KEY_LEN: xauth = &auth_hash_ccm_cbc_mac_192; break; case AES_256_CBC_MAC_KEY_LEN: xauth = &auth_hash_ccm_cbc_mac_256; break; default: error = EINVAL; goto bad; break; } break; default: error = ENOTSUP; goto bad; } #ifdef FCRYPTO_DEBUG printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " "auth %s (key length %d)\n", __FUNCTION__, __LINE__, xform->name, (unsigned int)key->ck_length, (unsigned int)key->ck_length/8, xauth->name, xauth->keysize); #endif crie.cri_alg = xform->type; crie.cri_key = key->ck_data; crie.cri_klen = key->ck_length; 
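The freebsd_crypt_uio() path for FreeBSD 13+ above hands OCF a single flat region laid out as AAD, then the encrypted payload, then the authentication tag, and every crp_* offset falls out of just auth_len and datalen (the legacy crd_skip/crd_len/crd_inject fields that follow encode the same layout). A tiny sketch of that bookkeeping, with a hypothetical descriptor struct standing in for the cryptop fields:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical description of [ AAD | payload | tag ] within one buffer. */
struct aead_layout {
	size_t aad_start, aad_len;
	size_t payload_start, payload_len;
	size_t digest_start;
};

static struct aead_layout
aead_layout(size_t auth_len, size_t datalen)
{
	struct aead_layout l = {
		.aad_start = 0,			.aad_len = auth_len,
		.payload_start = auth_len,	.payload_len = datalen,
		.digest_start = auth_len + datalen,
	};
	return (l);
}

int
main(void)
{
	/* e.g. a 96-byte MAC'd header followed by a 4 KiB encrypted block */
	struct aead_layout l = aead_layout(96, 4096);

	assert(l.payload_start == l.aad_len);
	assert(l.digest_start == l.payload_start + l.payload_len);
	printf("aad@%zu+%zu payload@%zu+%zu digest@%zu\n",
	    l.aad_start, l.aad_len, l.payload_start, l.payload_len,
	    l.digest_start);
	return (0);
}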
cria.cri_alg = xauth->type; cria.cri_key = key->ck_data; cria.cri_klen = key->ck_length; cria.cri_next = &crie; crie.cri_next = NULL; crip = &cria; // Everything else is zero-initialised error = crypto_newsession(&sid, crip, CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); if (error != 0) { printf("%s(%d): crypto_newsession failed with %d\n", __FUNCTION__, __LINE__, error); goto bad; } sessp->fs_sid = sid; mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", NULL, MTX_DEF); crypt_sessions++; bad: return (error); } /* * The meat of encryption/decryption. * If sessp is NULL, then it will create a * temporary cryptographic session, and release * it when done. */ int freebsd_crypt_uio(boolean_t encrypt, freebsd_crypt_session_t *input_sessionp, const struct zio_crypt_info *c_info, zfs_uio_t *data_uio, crypto_key_t *key, uint8_t *ivbuf, size_t datalen, size_t auth_len) { struct cryptop *crp; struct cryptodesc *enc_desc, *auth_desc; struct enc_xform *xform; struct auth_hash *xauth; freebsd_crypt_session_t *session = NULL; int error; freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, key, ivbuf, datalen, auth_len); switch (c_info->ci_crypt_type) { case ZC_TYPE_GCM: xform = &enc_xform_aes_nist_gcm; switch (key->ck_length/8) { case AES_128_GMAC_KEY_LEN: xauth = &auth_hash_nist_gmac_aes_128; break; case AES_192_GMAC_KEY_LEN: xauth = &auth_hash_nist_gmac_aes_192; break; case AES_256_GMAC_KEY_LEN: xauth = &auth_hash_nist_gmac_aes_256; break; default: error = EINVAL; goto bad; } break; case ZC_TYPE_CCM: xform = &enc_xform_ccm; switch (key->ck_length/8) { case AES_128_CBC_MAC_KEY_LEN: xauth = &auth_hash_ccm_cbc_mac_128; break; case AES_192_CBC_MAC_KEY_LEN: xauth = &auth_hash_ccm_cbc_mac_192; break; case AES_256_CBC_MAC_KEY_LEN: xauth = &auth_hash_ccm_cbc_mac_256; break; default: error = EINVAL; goto bad; break; } break; default: error = ENOTSUP; goto bad; } #ifdef FCRYPTO_DEBUG printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " "auth %s (key length %d)\n", __FUNCTION__, __LINE__, xform->name, (unsigned int)key->ck_length, (unsigned int)key->ck_length/8, xauth->name, xauth->keysize); #endif if (input_sessionp == NULL) { session = kmem_zalloc(sizeof (*session), KM_SLEEP); error = freebsd_crypt_newsession(session, c_info, key); if (error) goto out; } else session = input_sessionp; crp = crypto_getreq(2); if (crp == NULL) { error = ENOMEM; goto bad; } auth_desc = crp->crp_desc; enc_desc = auth_desc->crd_next; crp->crp_session = session->fs_sid; crp->crp_ilen = auth_len + datalen; crp->crp_buf = (void*)GET_UIO_STRUCT(data_uio); crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC; auth_desc->crd_skip = 0; auth_desc->crd_len = auth_len; auth_desc->crd_inject = auth_len + datalen; auth_desc->crd_alg = xauth->type; #ifdef FCRYPTO_DEBUG printf("%s: auth: skip = %u, len = %u, inject = %u\n", __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len, auth_desc->crd_inject); #endif enc_desc->crd_skip = auth_len; enc_desc->crd_len = datalen; enc_desc->crd_inject = auth_len; enc_desc->crd_alg = xform->type; enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; memcpy(enc_desc->crd_iv, ivbuf, ZIO_DATA_IV_LEN); enc_desc->crd_next = NULL; #ifdef FCRYPTO_DEBUG printf("%s: enc: skip = %u, len = %u, inject = %u\n", __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len, enc_desc->crd_inject); #endif if (encrypt) enc_desc->crd_flags |= CRD_F_ENCRYPT; error = zfs_crypto_dispatch(session, crp); crypto_freereq(crp); out: if (input_sessionp == NULL) { freebsd_crypt_freesession(session); 
kmem_free(session, sizeof (*session)); } bad: #ifdef FCRYPTO_DEBUG if (error) printf("%s: returning error %d\n", __FUNCTION__, error); #endif return (error); } #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c index a5f486b95db4..ee6fb2dc657b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c @@ -1,333 +1,330 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef IDX_TO_OFF #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #endif #if __FreeBSD_version < 1300051 #define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY #else #define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY #endif #if __FreeBSD_version < 1300072 #define dmu_page_lock(m) vm_page_lock(m) #define dmu_page_unlock(m) vm_page_unlock(m) #else #define dmu_page_lock(m) #define dmu_page_unlock(m) #endif int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, vm_page_t *ma, dmu_tx_t *tx) { dmu_buf_t **dbp; struct sf_buf *sf; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT3U(size, >, 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(*ma, &sf); memcpy((char *)db->db_data + bufoff, va, thiscpy); zfs_unmap_page(sf); ma += 1; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, int last_size) { struct sf_buf *sf; vm_object_t vmobj; vm_page_t m; dmu_buf_t **dbp; dmu_buf_t *db; caddr_t va; int numbufs, i; int bufoff, pgoff, tocpy; int mi, di; int err; ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); ASSERT3S(last_size, <=, PAGE_SIZE); err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); if (err != 0) return (err); #ifdef ZFS_DEBUG IMPLY(last_size < PAGE_SIZE, *rahead == 0); if (dbp[0]->db_offset != 0 || numbufs > 1) { for (i = 0; i < numbufs; i++) { ASSERT(ISP2(dbp[i]->db_size)); ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0); ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); } } #endif vmobj = ma[0]->object; zfs_vmobject_wlock_12(vmobj); db = dbp[0]; for (i = 0; i < *rbehind; i++) { m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); if (m == NULL) break; if (!vm_page_none_valid(m)) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(m); break; } ASSERT3U(m->dirty, ==, 0); ASSERT(!pmap_page_is_write_mapped(m)); ASSERT3U(db->db_size, >, PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; va = zfs_map_page(m, &sf); memcpy(va, (char *)db->db_data + bufoff, PAGESIZE); zfs_unmap_page(sf); vm_page_valid(m); dmu_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); dmu_page_unlock(m); vm_page_do_sunbusy(m); } *rbehind = i; bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; pgoff = 0; for (mi = 0, di = 0; mi < count && di < numbufs; ) 
{ if (pgoff == 0) { m = ma[mi]; if (m != bogus_page) { vm_page_assert_xbusied(m); ASSERT(vm_page_none_valid(m)); ASSERT3U(m->dirty, ==, 0); ASSERT(!pmap_page_is_write_mapped(m)); va = zfs_map_page(m, &sf); } } if (bufoff == 0) db = dbp[di]; if (m != bogus_page) { ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, db->db_offset + bufoff); } /* * We do not need to clamp the copy size by the file * size as the last block is zero-filled beyond the * end of file anyway. */ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); ASSERT3S(tocpy, >=, 0); if (m != bogus_page) memcpy(va + pgoff, (char *)db->db_data + bufoff, tocpy); pgoff += tocpy; ASSERT3S(pgoff, >=, 0); ASSERT3S(pgoff, <=, PAGESIZE); if (pgoff == PAGESIZE) { if (m != bogus_page) { zfs_unmap_page(sf); vm_page_valid(m); } ASSERT3S(mi, <, count); mi++; pgoff = 0; } bufoff += tocpy; ASSERT3S(bufoff, >=, 0); ASSERT3S(bufoff, <=, db->db_size); if (bufoff == db->db_size) { ASSERT3S(di, <, numbufs); di++; bufoff = 0; } } #ifdef ZFS_DEBUG /* * Three possibilities: * - last requested page ends at a buffer boundary and , thus, * all pages and buffers have been iterated; * - all requested pages are filled, but the last buffer * has not been exhausted; * the read-ahead is possible only in this case; * - all buffers have been read, but the last page has not been * fully filled; * this is only possible if the file has only a single buffer * with a size that is not a multiple of the page size. */ if (mi == count) { ASSERT3S(di, >=, numbufs - 1); IMPLY(*rahead != 0, di == numbufs - 1); IMPLY(*rahead != 0, bufoff != 0); ASSERT0(pgoff); } if (di == numbufs) { ASSERT3S(mi, >=, count - 1); ASSERT0(*rahead); IMPLY(pgoff == 0, mi == count); if (pgoff != 0) { ASSERT3S(mi, ==, count - 1); ASSERT3U((dbp[0]->db_size & PAGE_MASK), !=, 0); } } #endif if (pgoff != 0) { ASSERT3P(m, !=, bogus_page); memset(va + pgoff, 0, PAGESIZE - pgoff); zfs_unmap_page(sf); vm_page_valid(m); } for (i = 0; i < *rahead; i++) { m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); if (m == NULL) break; if (!vm_page_none_valid(m)) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(m); break; } ASSERT3U(m->dirty, ==, 0); ASSERT(!pmap_page_is_write_mapped(m)); ASSERT3U(db->db_size, >, PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; tocpy = MIN(db->db_size - bufoff, PAGESIZE); va = zfs_map_page(m, &sf); memcpy(va, (char *)db->db_data + bufoff, tocpy); if (tocpy < PAGESIZE) { ASSERT3S(i, ==, *rahead - 1); ASSERT3U((db->db_size & PAGE_MASK), !=, 0); memset(va + tocpy, 0, PAGESIZE - tocpy); } zfs_unmap_page(sf); vm_page_valid(m); dmu_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); dmu_page_unlock(m); vm_page_do_sunbusy(m); } *rahead = i; zfs_vmobject_wunlock_12(vmobj); dmu_buf_rele_array(dbp, numbufs, FTAG); return (0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index 00c1acf57710..2bced9ab6446 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -1,351 +1,348 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #include "zfs_deleg.h" #include "zfs_namecheck.h" #include "zfs_prop.h" SYSCTL_DECL(_vfs_zfs); SYSCTL_DECL(_vfs_zfs_vdev); extern uint_t rrw_tsd_key; static int zfs_version_ioctl = ZFS_IOCVER_OZFS; SYSCTL_DECL(_vfs_zfs_version); SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, 0, "ZFS_IOCTL_VERSION"); static struct cdev *zfsdev; static struct root_hold_token *zfs_root_token; extern uint_t rrw_tsd_key; extern uint_t zfs_allow_log_key; extern uint_t zfs_geom_probe_vdev_key; static int zfs__init(void); static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; #define ZFS_MIN_KSTACK_PAGES 4 static int zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, struct thread *td) { uint_t len; int vecnum; zfs_iocparm_t *zp; zfs_cmd_t *zc; #ifdef ZFS_LEGACY_SUPPORT zfs_cmd_legacy_t *zcl; #endif int rc, error; void *uaddr; len = IOCPARM_LEN(zcmd); vecnum = zcmd & 0xff; zp = (void *)arg; error = 0; #ifdef ZFS_LEGACY_SUPPORT zcl = NULL; #endif if (len != sizeof (zfs_iocparm_t)) return (EINVAL); uaddr = (void *)(uintptr_t)zp->zfs_cmd; zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); #ifdef ZFS_LEGACY_SUPPORT /* * Remap ioctl code for legacy user binaries */ if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) { vecnum = zfs_ioctl_legacy_to_ozfs(vecnum); if (vecnum < 0) { vmem_free(zc, sizeof (zfs_cmd_t)); return (ENOTSUP); } zcl = vmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { error = SET_ERROR(EFAULT); goto out; } zfs_cmd_legacy_to_ozfs(zcl, zc); } else #endif if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { error = SET_ERROR(EFAULT); goto out; } error = zfsdev_ioctl_common(vecnum, zc, 0); #ifdef ZFS_LEGACY_SUPPORT if (zcl) { zfs_cmd_ozfs_to_legacy(zc, zcl); rc = copyout(zcl, uaddr, sizeof (*zcl)); } 
else #endif { rc = copyout(zc, uaddr, sizeof (*zc)); } if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); out: #ifdef ZFS_LEGACY_SUPPORT if (zcl) vmem_free(zcl, sizeof (zfs_cmd_legacy_t)); #endif vmem_free(zc, sizeof (zfs_cmd_t)); MPASS(tsd_get(rrw_tsd_key) == NULL); return (error); } static void zfsdev_close(void *data) { zfsdev_state_destroy(data); } void zfsdev_private_set_state(void *priv __unused, zfsdev_state_t *zs) { devfs_set_cdevpriv(zs, zfsdev_close); } zfsdev_state_t * zfsdev_private_get_state(void *priv) { return (priv); } static int zfsdev_open(struct cdev *devp __unused, int flag __unused, int mode __unused, struct thread *td __unused) { int error; mutex_enter(&zfsdev_state_lock); error = zfsdev_state_init(NULL); mutex_exit(&zfsdev_state_lock); return (error); } static struct cdevsw zfs_cdevsw = { .d_version = D_VERSION, .d_open = zfsdev_open, .d_ioctl = zfsdev_ioctl, .d_name = ZFS_DRIVER }; int zfsdev_attach(void) { struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zfs_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0666; return (make_dev_s(&args, &zfsdev, ZFS_DRIVER)); } void zfsdev_detach(void) { if (zfsdev != NULL) destroy_dev(zfsdev); } int zfs__init(void) { int error; #if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " "overflow panic!\nPlease consider adding " "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, ZFS_MIN_KSTACK_PAGES); #endif zfs_root_token = root_mount_hold("ZFS"); if ((error = zfs_kmod_init()) != 0) { printf("ZFS: Failed to Load ZFS Filesystem" ", rc = %d\n", error); root_mount_rel(zfs_root_token); return (error); } tsd_create(&zfs_geom_probe_vdev_key, NULL); printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); ddi_sysevent_init(); return (0); } int zfs__fini(void) { if (zfs_busy() || zvol_busy() || zio_injection_enabled) { return (EBUSY); } zfs_kmod_fini(); tsd_destroy(&zfs_geom_probe_vdev_key); return (0); } static void zfs_shutdown(void *arg __unused, int howto __unused) { /* * ZFS fini routines can not properly work in a panic-ed system. 
*/ if (panicstr == NULL) zfs__fini(); } static int zfs_modevent(module_t mod, int type, void *unused __unused) { int err; switch (type) { case MOD_LOAD: err = zfs__init(); if (err == 0) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); return (err); case MOD_UNLOAD: err = zfs__fini(); if (err == 0 && zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); return (err); case MOD_SHUTDOWN: return (0); default: break; } return (EOPNOTSUPP); } static moduledata_t zfs_mod = { "zfsctrl", zfs_modevent, 0 }; #ifdef _KERNEL EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); #endif FEATURE(zfs, "OpenZFS support"); DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); #if __FreeBSD_version > 1300092 MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1); #else MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); #endif MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 312d76c3e023..30983b13f7d1 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -1,890 +1,887 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
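zfsdev_ioctl() above keeps compatibility with pre-OpenZFS userland by remapping the legacy vector number, converting the legacy command structure into the current one, running the common handler, and converting the result back before copyout; note that the EFAULT from the final copyout only surfaces when the handler itself succeeded. A stripped-down user-space sketch of that translate/dispatch/translate-back shape, with entirely hypothetical struct and function names in place of zfs_cmd_legacy_t/zfs_cmd_t:

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the legacy and current command structs. */
struct cmd_legacy { int lc_op; char lc_name[32]; };
struct cmd_new    { int nc_op; char nc_name[32]; int nc_flags; };

static void
legacy_to_new(const struct cmd_legacy *l, struct cmd_new *n)
{
	memset(n, 0, sizeof (*n));
	n->nc_op = l->lc_op;
	snprintf(n->nc_name, sizeof (n->nc_name), "%s", l->lc_name);
}

static void
new_to_legacy(const struct cmd_new *n, struct cmd_legacy *l)
{
	memset(l, 0, sizeof (*l));
	l->lc_op = n->nc_op;
	snprintf(l->lc_name, sizeof (l->lc_name), "%s", n->nc_name);
}

static int
handler_common(struct cmd_new *n)
{
	/* Pretend the common ioctl handler did some work. */
	n->nc_flags = 1;
	return (0);
}

/* Dispatch a legacy request: translate in, handle, translate back out. */
static int
dispatch_legacy(struct cmd_legacy *user_cmd)
{
	struct cmd_legacy l = *user_cmd;	/* stands in for copyin() */
	struct cmd_new n;
	int error;

	legacy_to_new(&l, &n);
	error = handler_common(&n);
	new_to_legacy(&n, &l);
	*user_cmd = l;				/* stands in for copyout() */
	return (error);
}

int
main(void)
{
	struct cmd_legacy c = { .lc_op = 7, .lc_name = "tank/fs" };

	printf("dispatch_legacy -> %d (op=%d name=%s)\n",
	    dispatch_legacy(&c), c.lc_op, c.lc_name);
	return (0);
}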
* */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, "ZFS Block Reference Table"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); SYSCTL_DECL(_vfs_zfs_version); SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); /* arc.c */ int param_set_arc_u64(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_int(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_int(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_max(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_max; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < 
MIN_ARC_MAX || val <= arc_c_min || val >= arc_all_memory())) return (SET_ERROR(EINVAL)); zfs_arc_max = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_max = arc_c_max; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_max, "LU", "Maximum ARC size in bytes (LEGACY)"); /* END CSTYLED */ int param_set_arc_min(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_min; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) return (SET_ERROR(EINVAL)); zfs_arc_min = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_min = arc_c_min; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_min, "LU", "Minimum ARC size in bytes (LEGACY)"); /* END CSTYLED */ extern uint_t zfs_arc_free_target; int param_set_arc_free_target(SYSCTL_HANDLER_ARGS) { uint_t val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on * pagedaemon initialization. */ /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim" " (LEGACY)"); /* END CSTYLED */ int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { int err, val; val = arc_no_grow_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_write_max; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, &l2arc_write_max, 0, "Max write bytes per interval (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_write_boost; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, &l2arc_write_boost, 0, "Extra write bytes during device warmup (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_headroom; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, &l2arc_headroom, 0, "Number of max device writes to precache (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_headroom_boost; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, "Compressed l2arc_headroom multiplier (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_feed_secs; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, &l2arc_feed_secs, 0, "Seconds between L2ARC writing (LEGACY)"); /* END CSTYLED */ extern uint64_t l2arc_feed_min_ms; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, "Min feed interval in milliseconds 
(LEGACY)"); /* END CSTYLED */ extern int l2arc_noprefetch; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, &l2arc_noprefetch, 0, "Skip caching prefetched buffers (LEGACY)"); /* END CSTYLED */ extern int l2arc_feed_again; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, &l2arc_feed_again, 0, "Turbo L2ARC warmup (LEGACY)"); /* END CSTYLED */ extern int l2arc_norw; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, &l2arc_norw, 0, "No reads during writes (LEGACY)"); /* END CSTYLED */ static int param_get_arc_state_size(SYSCTL_HANDLER_ARGS) { arc_state_t *state = (arc_state_t *)arg1; int64_t val; val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); return (sysctl_handle_64(oidp, &val, 0, req)); } extern arc_state_t ARC_anon; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_anon, 0, param_get_arc_state_size, "Q", "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in anonymous state"); /* END CSTYLED */ extern arc_state_t ARC_mru; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru, 0, param_get_arc_state_size, "Q", "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru state"); /* END CSTYLED */ extern arc_state_t ARC_mru_ghost; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru ghost state"); /* END CSTYLED */ extern arc_state_t ARC_mfu; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu, 0, param_get_arc_state_size, "Q", "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu state"); /* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, 
&ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu ghost state"); /* END CSTYLED */ extern arc_state_t ARC_uncached; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_uncached, 0, param_get_arc_state_size, "Q", "size of uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in uncached state"); /* END CSTYLED */ extern arc_state_t ARC_l2c_only; /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_l2c_only, 0, param_get_arc_state_size, "Q", "size of l2c_only state"); /* END CSTYLED */ /* dbuf.c */ /* dmu.c */ /* dmu_zfetch.c */ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); extern uint32_t zfetch_max_distance; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); /* END CSTYLED */ extern uint32_t zfetch_max_idistance; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream (LEGACY)"); /* END CSTYLED */ /* dsl_pool.c */ /* dnode.c */ /* dsl_scan.c */ /* metaslab.c */ int param_set_active_allocator(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_active_allocator, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_active_allocator) == 0) return (0); return (param_set_active_allocator_common(buf)); } /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ extern int zfs_metaslab_sm_blksz_no_log; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. " "Power of 2 greater than 4096."); /* END CSTYLED */ /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ extern int zfs_metaslab_sm_blksz_with_log; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 greater than 4096."); /* END CSTYLED */ /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. 
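The param_set_*() handlers in this file follow one pattern: copy the tunable into a local, let sysctl_handle_*() apply any new value from the request, return early on error or when the request was only a read, validate the range, and only then commit (sometimes re-reading the value the subsystem actually settled on, as param_set_arc_max() does after arc_tuning_update()). A user-space sketch of that shape, with a hypothetical tunable and bounds standing in for the sysctl plumbing:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical tunable, standing in for something like zfs_arc_max. */
static unsigned long tunable_max = 0;
static const unsigned long TUNABLE_FLOOR = 1UL << 20;	/* illustrative */
static const unsigned long TUNABLE_CEIL  = 1UL << 40;	/* illustrative */

/*
 * Mirrors the param_set_*() shape: "have_new" plays the role of
 * req->newptr != NULL, i.e. whether the caller supplied a new value.
 */
static int
param_set_tunable_max(unsigned long *val, bool have_new, unsigned long newval)
{
	*val = tunable_max;		/* start from the current setting */

	if (!have_new)
		return (0);		/* read-only request: report and stop */

	/* Validate before committing anything. */
	if (newval != 0 && (newval < TUNABLE_FLOOR || newval > TUNABLE_CEIL))
		return (EINVAL);

	tunable_max = newval;
	/* A real handler would now poke the subsystem to apply the change. */
	*val = tunable_max;
	return (0);
}

int
main(void)
{
	unsigned long v;

	printf("read: err=%d val=%lu\n",
	    param_set_tunable_max(&v, false, 0), v);
	printf("bad write: err=%d\n",
	    param_set_tunable_max(&v, true, 42));
	printf("good write: err=%d val=%lu\n",
	    param_set_tunable_max(&v, true, 1UL << 30), v);
	return (0);
}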
*/ extern uint_t zfs_condense_pct; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); /* END CSTYLED */ extern uint_t zfs_remove_max_segment; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to allocate when removing" " a device"); /* END CSTYLED */ extern int zfs_removal_suspend_progress; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while in the middle of a removal"); /* END CSTYLED */ /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ extern uint64_t metaslab_df_alloc_threshold; /* BEGIN CSTYLED */ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change its" " allocation strategy"); /* END CSTYLED */ /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ extern uint_t metaslab_df_free_pct; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a" " space map to continue allocations in a first-fit fashion"); /* END CSTYLED */ /* mmp.c */ int param_set_multihost_interval(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, &zfs_multihost_interval, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (spa_mode_global != SPA_MODE_UNINIT) mmp_signal_all_threads(); return (0); } /* spa.c */ extern int zfs_ccw_retry_interval; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval" " (seconds)"); /* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_cachefile; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "Allow importing pools with missing top-level vdevs in cache file"); /* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_scan; /* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "Allow importing pools with missing top-level vdevs during scan"); /* END CSTYLED */ /* spa_misc.c */ extern int zfs_flags; static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. 
*/ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); /* END CSTYLED */ int param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_synctime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_synctime_ms = val; spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_ziotime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_ziotime_ms = val; spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_deadman_failmode) == 0) return (0); if (strcmp(buf, "wait") == 0) zfs_deadman_failmode = "wait"; if (strcmp(buf, "continue") == 0) zfs_deadman_failmode = "continue"; if (strcmp(buf, "panic") == 0) zfs_deadman_failmode = "panic"; return (-param_set_deadman_failmode_common(buf)); } int param_set_slop_shift(SYSCTL_HANDLER_ARGS) { int val; int err; val = spa_slop_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 1 || val > 31) return (EINVAL); spa_slop_shift = val; return (0); } /* spacemap.c */ extern int space_map_ibs; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); /* END CSTYLED */ /* vdev.c */ int param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_min_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_min_auto_ashift = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. (LEGACY)"); /* END CSTYLED */ int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_max_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_max_auto_ashift = val; return (0); } /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. (LEGACY)"); /* END CSTYLED */ /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ extern int zfs_vdev_dtl_sm_blksz; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. 
Power of 2 greater than 4096."); /* END CSTYLED */ /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ extern int zfs_vdev_standard_sm_blksz; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); /* END CSTYLED */ extern int vdev_validate_skip; /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, &vdev_validate_skip, 0, "Enable to bypass vdev_validate()."); /* END CSTYLED */ /* vdev_mirror.c */ /* vdev_queue.c */ extern uint_t zfs_vdev_max_active; /* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device." " (LEGACY)"); /* END CSTYLED */ /* zio.c */ /* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c index 338982ff6873..79732d9173e8 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c @@ -1,133 +1,133 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) { spa_t *spa = vd->vdev_spa; zio_t *zio; abd_t *pad2; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; int error; if (size > VDEV_PAD_SIZE) return (EINVAL); if (!vd->vdev_ops->vdev_op_leaf) return (ENODEV); if (vdev_is_dead(vd)) return (ENXIO); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_WRITER), ==, SCL_ALL); pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); abd_copy_from_buf(pad2, buf, size); + abd_zero_off(pad2, size, VDEV_PAD_SIZE - size); retry: zio = zio_root(spa, NULL, NULL, flags); vdev_label_write(zio, vd, 0, pad2, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } abd_free(pad2); return (error); } static void vdev_child_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); } /* * Check if the reserved boot area is in-use. 
* * When booting FreeBSD with an MBR partition with ZFS, the zfsboot file * (which understands the ZFS file system) is written to the ZFS BOOT * reserve area (at offset 512K). We check for that here before attaching * a disk to raidz which would then corrupt this boot data. */ int vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) { ASSERT(childvd->vdev_ops->vdev_op_leaf); size_t size = SPA_MINBLOCKSIZE; abd_t *abd = abd_alloc_linear(size, B_FALSE); zio_t *pio = zio_root(spa, NULL, NULL, 0); /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to the offset * to calculate the physical offset to write to. Passing in a negative * offset lets us access the boot area. */ zio_nowait(zio_vdev_child_io(pio, NULL, childvd, VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, 0, vdev_child_done, pio)); zio_wait(pio); unsigned char *buf = abd_to_buf(abd); /* * The BTX server has a special header at the begining. * * btx_hdr: .byte 0xeb # Machine ID * .byte 0xe # Header size * .ascii "BTX" # Magic * .byte 0x1 # Major version * .byte 0x2 # Minor version * .byte BTX_FLAGS # Flags */ if (buf[0] == 0xeb && buf[1] == 0x0e && buf[2] == 'B' && buf[3] == 'T' && buf[4] == 'X') { abd_free(abd); return (EBUSY); } abd_free(abd); return (0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index 60c9ff0581e0..9d68499d82ec 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -1,307 +1,304 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
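vdev_check_boot_reserve() above refuses to let a disk join a raidz when the boot reserve area at offset 512K already holds a zfsboot/BTX image; the test is simply the first five bytes of the BTX header (0xeb, header size 0x0e, then the "BTX" magic). A user-space sketch of the same signature test over an arbitrary buffer (the function name is illustrative):

#include <stdint.h>
#include <stdio.h>

/*
 * Return non-zero if the buffer starts with a BTX header:
 *   byte 0: 0xeb (machine ID)
 *   byte 1: 0x0e (header size)
 *   bytes 2-4: "BTX" magic
 */
static int
has_btx_header(const uint8_t *buf, size_t len)
{
	return (len >= 5 && buf[0] == 0xeb && buf[1] == 0x0e &&
	    buf[2] == 'B' && buf[3] == 'T' && buf[4] == 'X');
}

int
main(void)
{
	uint8_t boot[512] = { 0xeb, 0x0e, 'B', 'T', 'X', 0x01, 0x02 };
	uint8_t blank[512] = { 0 };

	printf("boot:  %s\n", has_btx_header(boot, sizeof (boot)) ?
	    "in use (would return EBUSY)" : "free");
	printf("blank: %s\n", has_btx_header(blank, sizeof (blank)) ?
	    "in use (would return EBUSY)" : "free");
	return (0);
}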
* */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { struct thread *td; int rc, fd; td = curthread; pwd_ensure_dirs(); /* 12.x doesn't take a const char * */ rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path), UIO_SYSSPACE, flags, mode); if (rc) return (SET_ERROR(rc)); fd = td->td_retval[0]; td->td_retval[0] = 0; if (fget(curthread, fd, &cap_no_rights, fpp)) kern_close(td, fd); return (0); } void zfs_file_close(zfs_file_t *fp) { fo_close(fp, curthread); } static int zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp, ssize_t *resid) { ssize_t rc; struct uio auio; struct thread *td; struct iovec aiov; td = curthread; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = count; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = *offp; if ((fp->f_flag & FWRITE) == 0) return (SET_ERROR(EBADF)); if (fp->f_type == DTYPE_VNODE) bwillwrite(); rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); if (resid) *resid = auio.uio_resid; else if (auio.uio_resid) return (SET_ERROR(EIO)); *offp += count - auio.uio_resid; return (rc); } int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_offset; ssize_t rc; rc = zfs_file_write_impl(fp, buf, count, &off, resid); if (rc == 0) fp->f_offset = off; return (SET_ERROR(rc)); } int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, ssize_t *resid) { return (zfs_file_write_impl(fp, buf, count, &off, resid)); } static int zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp, ssize_t *resid) { ssize_t rc; struct uio auio; struct thread *td; struct iovec aiov; td = curthread; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = count; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = *offp; if ((fp->f_flag & FREAD) == 0) return (SET_ERROR(EBADF)); rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); if (resid) *resid = auio.uio_resid; *offp += count - auio.uio_resid; return (SET_ERROR(0)); } int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_offset; ssize_t rc; rc = zfs_file_read_impl(fp, buf, count, &off, resid); if (rc == 0) fp->f_offset = off; return (rc); } int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { return (zfs_file_read_impl(fp, buf, count, &off, resid)); } int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { int rc; struct thread *td; td = curthread; if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) return (SET_ERROR(ESPIPE)); rc = fo_seek(fp, *offp, whence, td); if (rc == 0) *offp = td->td_uretoff.tdu_off; return (SET_ERROR(rc)); } int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct thread *td; struct stat sb; int rc; td = curthread; #if __FreeBSD_version < 1400037 rc = fo_stat(fp, &sb, td->td_ucred, td); #else rc = fo_stat(fp, &sb, td->td_ucred); #endif if (rc) return (SET_ERROR(rc)); zfattr->zfa_size = sb.st_size; zfattr->zfa_mode = sb.st_mode; return (0); } static __inline int 
zfs_vop_fsync(vnode_t *vp) { struct mount *mp; int error; #if __FreeBSD_version < 1400068 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) #else if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) #endif goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK1(vp); vn_finished_write(mp); drop: return (SET_ERROR(error)); } int zfs_file_fsync(zfs_file_t *fp, int flags) { if (fp->f_type != DTYPE_VNODE) return (EINVAL); return (zfs_vop_fsync(fp->f_vnode)); } zfs_file_t * zfs_file_get(int fd) { struct file *fp; if (fget(curthread, fd, &cap_no_rights, &fp)) return (NULL); return (fp); } void zfs_file_put(zfs_file_t *fp) { fdrop(fp, curthread); } loff_t zfs_file_off(zfs_file_t *fp) { return (fp->f_offset); } void * zfs_file_private(zfs_file_t *fp) { file_t *tmpfp; void *data; int error; tmpfp = curthread->td_fpop; curthread->td_fpop = fp; error = devfs_get_cdevpriv(&data); curthread->td_fpop = tmpfp; if (error != 0) return (NULL); return (data); } int zfs_file_unlink(const char *fnamep) { zfs_uio_seg_t seg = UIO_SYSSPACE; int rc; #if __FreeBSD_version >= 1300018 rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); #elif __FreeBSD_version >= 1202504 || defined(AT_BENEATH) rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0, 0); #else rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0); #endif return (SET_ERROR(rc)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c index 3ddffec91e83..4a7beb650a8d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c @@ -1,365 +1,362 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
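zfs_file_write() and zfs_file_pwrite() above differ only in where the offset lives: the former seeds the I/O from fp->f_offset and advances it on success, while the latter uses a caller-supplied offset and leaves the file position untouched; both delegate to the same *_impl helper that bumps *offp by the bytes actually transferred. A toy model of that split, with illustrative names only:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t toy_off_t;    /* stand-in for loff_t */

struct toy_file { toy_off_t f_offset; };

/* Common helper: "writes" count bytes at *offp and advances *offp. */
static int
write_impl(struct toy_file *fp, size_t count, toy_off_t *offp)
{
    (void) fp;
    *offp += count;    /* count - uio_resid in the real code */
    return (0);
}

static int
toy_write(struct toy_file *fp, size_t count)
{
    toy_off_t off = fp->f_offset;
    int rc = write_impl(fp, count, &off);

    if (rc == 0)
        fp->f_offset = off;    /* stateful: advance the file position */
    return (rc);
}

static int
toy_pwrite(struct toy_file *fp, size_t count, toy_off_t off)
{
    return (write_impl(fp, count, &off));    /* file position untouched */
}

int
main(void)
{
    struct toy_file f = { 0 };

    toy_write(&f, 100);
    toy_pwrite(&f, 50, 4096);
    printf("f_offset=%lld\n", (long long)f.f_offset);    /* prints 100 */
    return (0);
}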
* */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_LEGACY_SUPPORT enum zfs_ioc_legacy { ZFS_IOC_LEGACY_NONE = -1, ZFS_IOC_LEGACY_FIRST = 0, ZFS_LEGACY_IOC = ZFS_IOC_LEGACY_FIRST, ZFS_IOC_LEGACY_POOL_CREATE = ZFS_IOC_LEGACY_FIRST, ZFS_IOC_LEGACY_POOL_DESTROY, ZFS_IOC_LEGACY_POOL_IMPORT, ZFS_IOC_LEGACY_POOL_EXPORT, ZFS_IOC_LEGACY_POOL_CONFIGS, ZFS_IOC_LEGACY_POOL_STATS, ZFS_IOC_LEGACY_POOL_TRYIMPORT, ZFS_IOC_LEGACY_POOL_SCAN, ZFS_IOC_LEGACY_POOL_FREEZE, ZFS_IOC_LEGACY_POOL_UPGRADE, ZFS_IOC_LEGACY_POOL_GET_HISTORY, ZFS_IOC_LEGACY_VDEV_ADD, ZFS_IOC_LEGACY_VDEV_REMOVE, ZFS_IOC_LEGACY_VDEV_SET_STATE, ZFS_IOC_LEGACY_VDEV_ATTACH, ZFS_IOC_LEGACY_VDEV_DETACH, ZFS_IOC_LEGACY_VDEV_SETPATH, ZFS_IOC_LEGACY_VDEV_SETFRU, ZFS_IOC_LEGACY_OBJSET_STATS, ZFS_IOC_LEGACY_OBJSET_ZPLPROPS, ZFS_IOC_LEGACY_DATASET_LIST_NEXT, ZFS_IOC_LEGACY_SNAPSHOT_LIST_NEXT, ZFS_IOC_LEGACY_SET_PROP, ZFS_IOC_LEGACY_CREATE, ZFS_IOC_LEGACY_DESTROY, ZFS_IOC_LEGACY_ROLLBACK, ZFS_IOC_LEGACY_RENAME, ZFS_IOC_LEGACY_RECV, ZFS_IOC_LEGACY_SEND, ZFS_IOC_LEGACY_INJECT_FAULT, ZFS_IOC_LEGACY_CLEAR_FAULT, ZFS_IOC_LEGACY_INJECT_LIST_NEXT, ZFS_IOC_LEGACY_ERROR_LOG, ZFS_IOC_LEGACY_CLEAR, ZFS_IOC_LEGACY_PROMOTE, ZFS_IOC_LEGACY_DESTROY_SNAPS, ZFS_IOC_LEGACY_SNAPSHOT, ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, ZFS_IOC_LEGACY_OBJ_TO_PATH, ZFS_IOC_LEGACY_POOL_SET_PROPS, ZFS_IOC_LEGACY_POOL_GET_PROPS, ZFS_IOC_LEGACY_SET_FSACL, ZFS_IOC_LEGACY_GET_FSACL, ZFS_IOC_LEGACY_SHARE, ZFS_IOC_LEGACY_INHERIT_PROP, ZFS_IOC_LEGACY_SMB_ACL, ZFS_IOC_LEGACY_USERSPACE_ONE, ZFS_IOC_LEGACY_USERSPACE_MANY, ZFS_IOC_LEGACY_USERSPACE_UPGRADE, ZFS_IOC_LEGACY_HOLD, ZFS_IOC_LEGACY_RELEASE, ZFS_IOC_LEGACY_GET_HOLDS, ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, ZFS_IOC_LEGACY_VDEV_SPLIT, ZFS_IOC_LEGACY_NEXT_OBJ, ZFS_IOC_LEGACY_DIFF, ZFS_IOC_LEGACY_TMP_SNAPSHOT, ZFS_IOC_LEGACY_OBJ_TO_STATS, ZFS_IOC_LEGACY_JAIL, ZFS_IOC_LEGACY_UNJAIL, ZFS_IOC_LEGACY_POOL_REGUID, ZFS_IOC_LEGACY_SPACE_WRITTEN, ZFS_IOC_LEGACY_SPACE_SNAPS, ZFS_IOC_LEGACY_SEND_PROGRESS, ZFS_IOC_LEGACY_POOL_REOPEN, ZFS_IOC_LEGACY_LOG_HISTORY, ZFS_IOC_LEGACY_SEND_NEW, ZFS_IOC_LEGACY_SEND_SPACE, ZFS_IOC_LEGACY_CLONE, ZFS_IOC_LEGACY_BOOKMARK, ZFS_IOC_LEGACY_GET_BOOKMARKS, ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, ZFS_IOC_LEGACY_NEXTBOOT, ZFS_IOC_LEGACY_CHANNEL_PROGRAM, ZFS_IOC_LEGACY_REMAP, ZFS_IOC_LEGACY_POOL_CHECKPOINT, ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, ZFS_IOC_LEGACY_POOL_INITIALIZE, ZFS_IOC_LEGACY_POOL_SYNC, ZFS_IOC_LEGACY_LAST }; static unsigned long zfs_ioctl_legacy_to_ozfs_[] = { ZFS_IOC_POOL_CREATE, /* 0x00 */ ZFS_IOC_POOL_DESTROY, /* 0x01 */ ZFS_IOC_POOL_IMPORT, /* 0x02 */ ZFS_IOC_POOL_EXPORT, /* 0x03 */ ZFS_IOC_POOL_CONFIGS, /* 0x04 */ ZFS_IOC_POOL_STATS, /* 0x05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ ZFS_IOC_POOL_SCAN, /* 0x07 */ ZFS_IOC_POOL_FREEZE, /* 0x08 */ ZFS_IOC_POOL_UPGRADE, /* 0x09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ ZFS_IOC_VDEV_ADD, /* 0x0b */ ZFS_IOC_VDEV_REMOVE, /* 0x0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ ZFS_IOC_VDEV_ATTACH, /* 0x0e */ ZFS_IOC_VDEV_DETACH, /* 0x0f */ ZFS_IOC_VDEV_SETPATH, /* 0x10 */ ZFS_IOC_VDEV_SETFRU, /* 0x11 */ ZFS_IOC_OBJSET_STATS, /* 0x12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ ZFS_IOC_SET_PROP, /* 0x16 */ ZFS_IOC_CREATE, /* 0x17 */ ZFS_IOC_DESTROY, /* 0x18 */ ZFS_IOC_ROLLBACK, /* 0x19 */ ZFS_IOC_RENAME, /* 0x1a */ ZFS_IOC_RECV, /* 0x1b */ ZFS_IOC_SEND, /* 0x1c */ ZFS_IOC_INJECT_FAULT, /* 0x1d */ ZFS_IOC_CLEAR_FAULT, /* 0x1e */ 
ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ ZFS_IOC_ERROR_LOG, /* 0x20 */ ZFS_IOC_CLEAR, /* 0x21 */ ZFS_IOC_PROMOTE, /* 0x22 */ /* start of mismatch */ ZFS_IOC_DESTROY_SNAPS, /* 0x23:0x3b */ ZFS_IOC_SNAPSHOT, /* 0x24:0x23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x25:0x24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x26:0x25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x27:0x26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x28:0x27 */ ZFS_IOC_SET_FSACL, /* 0x29:0x28 */ ZFS_IOC_GET_FSACL, /* 0x30:0x29 */ ZFS_IOC_SHARE, /* 0x2b:0x2a */ ZFS_IOC_INHERIT_PROP, /* 0x2c:0x2b */ ZFS_IOC_SMB_ACL, /* 0x2d:0x2c */ ZFS_IOC_USERSPACE_ONE, /* 0x2e:0x2d */ ZFS_IOC_USERSPACE_MANY, /* 0x2f:0x2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x30:0x2f */ ZFS_IOC_HOLD, /* 0x31:0x30 */ ZFS_IOC_RELEASE, /* 0x32:0x31 */ ZFS_IOC_GET_HOLDS, /* 0x33:0x32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x34:0x33 */ ZFS_IOC_VDEV_SPLIT, /* 0x35:0x34 */ ZFS_IOC_NEXT_OBJ, /* 0x36:0x35 */ ZFS_IOC_DIFF, /* 0x37:0x36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x38:0x37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x39:0x38 */ ZFS_IOC_JAIL, /* 0x3a:0xc2 */ ZFS_IOC_UNJAIL, /* 0x3b:0xc3 */ ZFS_IOC_POOL_REGUID, /* 0x3c:0x3c */ ZFS_IOC_SPACE_WRITTEN, /* 0x3d:0x39 */ ZFS_IOC_SPACE_SNAPS, /* 0x3e:0x3a */ ZFS_IOC_SEND_PROGRESS, /* 0x3f:0x3e */ ZFS_IOC_POOL_REOPEN, /* 0x40:0x3d */ ZFS_IOC_LOG_HISTORY, /* 0x41:0x3f */ ZFS_IOC_SEND_NEW, /* 0x42:0x40 */ ZFS_IOC_SEND_SPACE, /* 0x43:0x41 */ ZFS_IOC_CLONE, /* 0x44:0x42 */ ZFS_IOC_BOOKMARK, /* 0x45:0x43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x46:0x44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x47:0x45 */ ZFS_IOC_NEXTBOOT, /* 0x48:0xc1 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x49:0x48 */ ZFS_IOC_REMAP, /* 0x4a:0x4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x4b:0x4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x4c:0x4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x4d:0x4f */ }; static unsigned long zfs_ioctl_ozfs_to_legacy_common_[] = { ZFS_IOC_POOL_CREATE, /* 0x00 */ ZFS_IOC_POOL_DESTROY, /* 0x01 */ ZFS_IOC_POOL_IMPORT, /* 0x02 */ ZFS_IOC_POOL_EXPORT, /* 0x03 */ ZFS_IOC_POOL_CONFIGS, /* 0x04 */ ZFS_IOC_POOL_STATS, /* 0x05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ ZFS_IOC_POOL_SCAN, /* 0x07 */ ZFS_IOC_POOL_FREEZE, /* 0x08 */ ZFS_IOC_POOL_UPGRADE, /* 0x09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ ZFS_IOC_VDEV_ADD, /* 0x0b */ ZFS_IOC_VDEV_REMOVE, /* 0x0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ ZFS_IOC_VDEV_ATTACH, /* 0x0e */ ZFS_IOC_VDEV_DETACH, /* 0x0f */ ZFS_IOC_VDEV_SETPATH, /* 0x10 */ ZFS_IOC_VDEV_SETFRU, /* 0x11 */ ZFS_IOC_OBJSET_STATS, /* 0x12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ ZFS_IOC_SET_PROP, /* 0x16 */ ZFS_IOC_CREATE, /* 0x17 */ ZFS_IOC_DESTROY, /* 0x18 */ ZFS_IOC_ROLLBACK, /* 0x19 */ ZFS_IOC_RENAME, /* 0x1a */ ZFS_IOC_RECV, /* 0x1b */ ZFS_IOC_SEND, /* 0x1c */ ZFS_IOC_INJECT_FAULT, /* 0x1d */ ZFS_IOC_CLEAR_FAULT, /* 0x1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ ZFS_IOC_ERROR_LOG, /* 0x20 */ ZFS_IOC_CLEAR, /* 0x21 */ ZFS_IOC_PROMOTE, /* 0x22 */ /* start of mismatch */ ZFS_IOC_LEGACY_SNAPSHOT, /* 0x23 */ ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, /* 0x24 */ ZFS_IOC_LEGACY_OBJ_TO_PATH, /* 0x25 */ ZFS_IOC_LEGACY_POOL_SET_PROPS, /* 0x26 */ ZFS_IOC_LEGACY_POOL_GET_PROPS, /* 0x27 */ ZFS_IOC_LEGACY_SET_FSACL, /* 0x28 */ ZFS_IOC_LEGACY_GET_FSACL, /* 0x29 */ ZFS_IOC_LEGACY_SHARE, /* 0x2a */ ZFS_IOC_LEGACY_INHERIT_PROP, /* 0x2b */ ZFS_IOC_LEGACY_SMB_ACL, /* 0x2c */ ZFS_IOC_LEGACY_USERSPACE_ONE, /* 0x2d */ ZFS_IOC_LEGACY_USERSPACE_MANY, /* 0x2e */ ZFS_IOC_LEGACY_USERSPACE_UPGRADE, /* 0x2f */ ZFS_IOC_LEGACY_HOLD, /* 0x30 */ ZFS_IOC_LEGACY_RELEASE, /* 0x31 */ ZFS_IOC_LEGACY_GET_HOLDS, /* 0x32 */ 
ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, /* 0x33 */ ZFS_IOC_LEGACY_VDEV_SPLIT, /* 0x34 */ ZFS_IOC_LEGACY_NEXT_OBJ, /* 0x35 */ ZFS_IOC_LEGACY_DIFF, /* 0x36 */ ZFS_IOC_LEGACY_TMP_SNAPSHOT, /* 0x37 */ ZFS_IOC_LEGACY_OBJ_TO_STATS, /* 0x38 */ ZFS_IOC_LEGACY_SPACE_WRITTEN, /* 0x39 */ ZFS_IOC_LEGACY_SPACE_SNAPS, /* 0x3a */ ZFS_IOC_LEGACY_DESTROY_SNAPS, /* 0x3b */ ZFS_IOC_LEGACY_POOL_REGUID, /* 0x3c */ ZFS_IOC_LEGACY_POOL_REOPEN, /* 0x3d */ ZFS_IOC_LEGACY_SEND_PROGRESS, /* 0x3e */ ZFS_IOC_LEGACY_LOG_HISTORY, /* 0x3f */ ZFS_IOC_LEGACY_SEND_NEW, /* 0x40 */ ZFS_IOC_LEGACY_SEND_SPACE, /* 0x41 */ ZFS_IOC_LEGACY_CLONE, /* 0x42 */ ZFS_IOC_LEGACY_BOOKMARK, /* 0x43 */ ZFS_IOC_LEGACY_GET_BOOKMARKS, /* 0x44 */ ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, /* 0x45 */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_RECV_NEW */ ZFS_IOC_LEGACY_POOL_SYNC, /* 0x47 */ ZFS_IOC_LEGACY_CHANNEL_PROGRAM, /* 0x48 */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_LOAD_KEY */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_UNLOAD_KEY */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_CHANGE_KEY */ ZFS_IOC_LEGACY_REMAP, /* 0x4c */ ZFS_IOC_LEGACY_POOL_CHECKPOINT, /* 0x4d */ ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, /* 0x4e */ ZFS_IOC_LEGACY_POOL_INITIALIZE, /* 0x4f */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_POOL_TRIM */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_REDACT */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOKMARK_PROPS */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT_FS */ }; static unsigned long zfs_ioctl_ozfs_to_legacy_platform_[] = { ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_NEXT */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_CLEAR */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_SEEK */ ZFS_IOC_LEGACY_NEXTBOOT, ZFS_IOC_LEGACY_JAIL, ZFS_IOC_LEGACY_UNJAIL, ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_SET_BOOTENV */ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOTENV */ }; int zfs_ioctl_legacy_to_ozfs(int request) { if (request >= sizeof (zfs_ioctl_legacy_to_ozfs_)/sizeof (long)) return (-1); return (zfs_ioctl_legacy_to_ozfs_[request]); } int zfs_ioctl_ozfs_to_legacy(int request) { if (request >= ZFS_IOC_LAST) return (-1); if (request > ZFS_IOC_PLATFORM) { request -= ZFS_IOC_PLATFORM + 1; return (zfs_ioctl_ozfs_to_legacy_platform_[request]); } if (request >= sizeof (zfs_ioctl_ozfs_to_legacy_common_)/sizeof (long)) return (-1); return (zfs_ioctl_ozfs_to_legacy_common_[request]); } void zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst) { memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); *&dst->zc_objset_stats = *&src->zc_objset_stats; memcpy(&dst->zc_begin_record, &src->zc_begin_record, offsetof(zfs_cmd_t, zc_sendobj) - offsetof(zfs_cmd_t, zc_begin_record)); memcpy(&dst->zc_sendobj, &src->zc_sendobj, sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); dst->zc_zoneid = src->zc_jailid; } void zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst) { memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); *&dst->zc_objset_stats = *&src->zc_objset_stats; *&dst->zc_begin_record.drr_u.drr_begin = *&src->zc_begin_record; dst->zc_begin_record.drr_payloadlen = 0; dst->zc_begin_record.drr_type = 0; memcpy(&dst->zc_inject_record, &src->zc_inject_record, offsetof(zfs_cmd_t, zc_sendobj) - offsetof(zfs_cmd_t, zc_inject_record)); dst->zc_resumable = B_FALSE; memcpy(&dst->zc_sendobj, &src->zc_sendobj, sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); dst->zc_jailid = src->zc_zoneid; } #endif /* ZFS_LEGACY_SUPPORT */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c index a835e013d630..b8f5fa4e7543 100644 --- 
a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -1,178 +1,175 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ -#include -__FBSDID("$FreeBSD$"); - #include #include #include #include #include #include #include #include #include #if __FreeBSD_version < 1201517 #define vm_page_max_user_wired vm_page_max_wired #endif int zfs_vfs_ref(zfsvfs_t **zfvp) { int error = 0; if (*zfvp == NULL) return (SET_ERROR(ESRCH)); error = vfs_busy((*zfvp)->z_vfs, 0); if (error != 0) { *zfvp = NULL; error = SET_ERROR(ESRCH); } return (error); } boolean_t zfs_vfs_held(zfsvfs_t *zfsvfs) { return (zfsvfs->z_vfs != NULL); } void zfs_vfs_rele(zfsvfs_t *zfsvfs) { vfs_unbusy(zfsvfs->z_vfs); } static const zfs_ioc_key_t zfs_keys_nextboot[] = { {"command", DATA_TYPE_STRING, 0}, { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0}, { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0} }; static int zfs_ioc_jail(zfs_cmd_t *zc) { return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, (int)zc->zc_zoneid)); } static int zfs_ioc_unjail(zfs_cmd_t *zc) { return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, (int)zc->zc_zoneid)); } static int zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char name[MAXNAMELEN]; spa_t *spa; vdev_t *vd; const char *command; uint64_t pool_guid; uint64_t vdev_guid; int error; if (nvlist_lookup_uint64(innvl, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_CONFIG_GUID, &vdev_guid) != 0) return (EINVAL); if (nvlist_lookup_string(innvl, "command", &command) != 0) return (EINVAL); mutex_enter(&spa_namespace_lock); spa = spa_by_guid(pool_guid, vdev_guid); if (spa != NULL) strcpy(name, spa_name(spa)); mutex_exit(&spa_namespace_lock); if (spa == NULL) return (ENOENT); if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENXIO); spa_close(spa, FTAG); return (ENODEV); } error = vdev_label_write_pad2(vd, command, strlen(command)); (void) spa_vdev_state_exit(spa, NULL, 0); txg_wait_synced(spa->spa_dsl_pool, 0); spa_close(spa, FTAG); return (error); } /* Update 
the VFS's cache of mountpoint properties */ void zfs_ioctl_update_mount_cache(const char *dsname) { zfsvfs_t *zfsvfs; if (getzfsvfs(dsname, &zfsvfs) == 0) { struct mount *mp = zfsvfs->z_vfs; VFS_STATFS(mp, &mp->mnt_stat); zfs_vfs_rele(zfsvfs); } /* * Ignore errors; we can't do anything useful if either getzfsvfs or * VFS_STATFS fails. */ } uint64_t zfs_max_nvlist_src_size_os(void) { if (zfs_max_nvlist_src_size != 0) return (zfs_max_nvlist_src_size); return (ptob(vm_page_max_user_wired) / 4); } void zfs_ioctl_init_os(void) { zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3); } diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 0a179fffb16a..5cd97b9faca5 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -1,5200 +1,5247 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* * Various statistics about the size of the dbuf cache. */ kstat_named_t cache_count; kstat_named_t cache_size_bytes; kstat_named_t cache_size_bytes_max; /* * Statistics regarding the bounds on the dbuf cache size. */ kstat_named_t cache_target_bytes; kstat_named_t cache_lowater_bytes; kstat_named_t cache_hiwater_bytes; /* * Total number of dbuf cache evictions that have occurred. */ kstat_named_t cache_total_evicts; /* * The distribution of dbuf levels in the dbuf cache and * the total size of all dbufs at each level. */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; /* * Statistics about the dbuf hash table. 
*/ kstat_named_t hash_hits; kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. */ kstat_named_t hash_chains; kstat_named_t hash_chain_max; /* * Number of times a dbuf_create() discovers that a dbuf was * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; /* * Number of entries in the hash table dbuf and mutex arrays. */ kstat_named_t hash_table_count; kstat_named_t hash_mutex_count; /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; kstat_named_t metadata_cache_size_bytes; kstat_named_t metadata_cache_size_bytes_max; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ kstat_named_t metadata_cache_overflow; } dbuf_stats_t; dbuf_stats_t dbuf_stats = { { "cache_count", KSTAT_DATA_UINT64 }, { "cache_size_bytes", KSTAT_DATA_UINT64 }, { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "cache_target_bytes", KSTAT_DATA_UINT64 }, { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, { "hash_table_count", KSTAT_DATA_UINT64 }, { "hash_mutex_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; struct { wmsum_t cache_count; wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; wmsum_t hash_chains; wmsum_t hash_insert_race; wmsum_t metadata_cache_count; wmsum_t metadata_cache_overflow; } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ wmsum_add(&dbuf_sums.stat, val); #define DBUF_STAT_DECR(stat, val) \ DBUF_STAT_INCR(stat, -(val)); #define DBUF_STAT_BUMP(stat) \ DBUF_STAT_INCR(stat, 1); #define DBUF_STAT_BUMPDOWN(stat) \ DBUF_STAT_INCR(stat, -1); #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ continue; \ } static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * There are two dbuf caches; each dbuf can only be in one of them at a time. * * 1. Cache of metadata dbufs, to help make read-heavy administrative commands * from /sbin/zfs run faster. 
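The DBUF_STAT_MAX() macro above keeps a running maximum without taking a lock: it rereads the current value and retries a compare-and-swap until either the stored maximum is already at least v or the swap wins. A minimal sketch of the same pattern using C11 atomics (the kernel code uses atomic_cas_64(); this is illustration only):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Lock-free "update maximum": retry the CAS until the stored value is
 * already at least v, or our swap succeeds.
 */
static void
atomic_max64(_Atomic uint64_t *max, uint64_t v)
{
    uint64_t cur = atomic_load(max);

    while (v > cur && !atomic_compare_exchange_weak(max, &cur, v))
        ;    /* cur was reloaded by the failed CAS; recheck and retry */
}

int
main(void)
{
    _Atomic uint64_t m = 5;

    atomic_max64(&m, 3);    /* no change */
    atomic_max64(&m, 9);    /* becomes 9 */
    printf("%llu\n", (unsigned long long)atomic_load(&m));
    return (0);
}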
The "metadata cache" specifically stores dbufs * that represent the metadata that describes filesystems/snapshots/ * bookmarks/properties/etc. We only evict from this cache when we export a * pool, to short-circuit as much I/O as possible for all administrative * commands that need the metadata. There is no eviction policy for this * cache, because we try to only include types in it which would occupy a * very small amount of space per object but create a large impact on the * performance of these commands. Instead, after it reaches a maximum size * (which should only happen on very small memory systems with a very large * number of filesystem objects), we stop taking new dbufs into the * metadata cache, instead putting them in the normal dbuf cache. * * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. * * Dbufs are added to these caches once the last hold is released. If a dbuf is * later accessed and still exists in the dbuf cache, then it will be removed * from the cache and later re-added to the head of the cache. * * If a given dbuf meets the requirements for the metadata cache, it will go * there, otherwise it will be considered for the generic LRU dbuf cache. The * caches and the refcounts tracking their sizes are stored in an array indexed * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { multilist_t cache; zfs_refcount_t size ____cacheline_aligned; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ static uint64_t dbuf_cache_max_bytes = UINT64_MAX; static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ static uint_t dbuf_cache_shift = 5; static uint_t dbuf_metadata_cache_shift = 6; /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ static uint_t dbuf_mutex_cache_shift = 0; static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. 
The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. */ static uint_t dbuf_cache_hiwater_pct = 10; static uint_t dbuf_cache_lowater_pct = 10; static int dbuf_cons(void *vdb, void *unused, int kmflag) { (void) unused, (void) kmflag; dmu_buf_impl_t *db = vdb; memset(db, 0, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); return (0); } static void dbuf_dest(void *vdb, void *unused) { (void) unused; dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } #define DTRACE_SET_STATE(db, why) \ DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \ const char *, why) #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid, uint64_t *hash_out) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; uint64_t idx; dmu_buf_impl_t *db; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); if (hash_out != NULL) *hash_out = hv; return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. 
*/ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash); idx = db->db_hash & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } if (i > 0) { DBUF_STAT_BUMP(hash_collisions); if (i == 1) DBUF_STAT_BUMP(hash_chains); DBUF_STAT_MAX(hash_chain_max, i); } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); return (NULL); } /* * This returns whether this dbuf should be stored in the metadata cache, which * is based on whether it's from one of the dnode types that store data related * to traversing dataset hierarchies. */ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); dmu_object_type_t type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ if (DMU_OT_IS_METADATA_CACHED(type)) { /* If we hit this, then we set something up wrong in dmu_ot */ ASSERT(DMU_OT_IS_METADATA(type)); /* * Sanity check for small-memory systems: don't allocate too * much memory for this purpose. */ if (zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > dbuf_metadata_cache_target_bytes()) { DBUF_STAT_BUMP(metadata_cache_overflow); return (B_FALSE); } return (B_TRUE); } return (B_FALSE); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t idx; dmu_buf_impl_t *dbf, **dbp; ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid), ==, db->db_hash); idx = db->db_hash & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = zfs_refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). 
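dbuf_hash_remove() above unlinks the dbuf from its hash chain with a pointer-to-pointer walk, which handles the head of the chain and interior entries identically. A standalone sketch of that unlink on a plain singly linked list (names are illustrative, not ZFS API):

#include <assert.h>
#include <stddef.h>

struct node {
    struct node *next;
    int          key;
};

/*
 * Unlink victim from the chain rooted at *headp; advancing a
 * pointer-to-pointer means no special case for the head entry.
 */
static void
chain_remove(struct node **headp, struct node *victim)
{
    struct node **np = headp, *n;

    while ((n = *np) != victim) {
        assert(n != NULL);    /* victim must be on the chain */
        np = &n->next;
    }
    *np = victim->next;
    victim->next = NULL;
}

int
main(void)
{
    struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
    struct node *head = &a;

    chain_remove(&head, &a);    /* remove the head */
    chain_remove(&head, &c);    /* remove the tail */
    assert(head == &b && b.next == NULL);
    return (0);
}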
However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif + if (db->db_caching_status != DB_NO_CACHE) { + /* + * This is a cached dbuf, so the size of the user data is + * included in its cached amount. We adjust it here because the + * user data has already been detached from the dbuf, and the + * sync functions are not supposed to touch it (the dbuf might + * not exist anymore by the time the sync functions run. + */ + uint64_t size = dbu->dbu_size; + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, size, db); + if (db->db_caching_status == DB_DBUF_CACHE) + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + } + /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for * avoiding lock order reversals and limiting stack depth. * * Note that if we have a sync callback but no async callback, * it's likely that the sync callback will free the structure * containing the dbu. In that case we need to take care to not * dereference dbu after calling the sync evict func. */ boolean_t has_async = (dbu->dbu_evict_func_async != NULL); if (dbu->dbu_evict_func_sync != NULL) dbu->dbu_evict_func_sync(dbu); if (has_async) { taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, dbu, 0, &dbu->dbu_tqent); } } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { /* * Consider indirect blocks and spill blocks to be meta data. */ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * We want to exclude buffers that are on a special allocation class from * L2ARC. 
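The new hunk in dbuf_evict_user() above is the accounting half of the user-size ("usize") change: when user data is detached from a dbuf that still sits in one of the caches, its dbu_size is subtracted from that cache's size refcount (and from the per-level byte counter for the LRU cache), so dbuf_evict_one() later only gives back what is still charged. A toy model of that invariant, with made-up names and sizes:

#include <assert.h>
#include <stdint.h>

/*
 * Toy model: a cached dbuf charges db_size plus any attached user-data
 * size, and whichever is detached first (the user data or the dbuf
 * itself) returns exactly its share.
 */
struct toy_dbuf {
    uint64_t db_size;
    uint64_t user_size;    /* 0 when no user data is attached */
};

static uint64_t cache_bytes;

static void
toy_cache_insert(struct toy_dbuf *db)
{
    cache_bytes += db->db_size + db->user_size;
}

static void
toy_evict_user(struct toy_dbuf *db)
{
    cache_bytes -= db->user_size;    /* mirrors the dbuf_evict_user() hunk */
    db->user_size = 0;
}

static void
toy_evict_dbuf(struct toy_dbuf *db)
{
    cache_bytes -= db->db_size + db->user_size;    /* dbuf_evict_one() */
}

int
main(void)
{
    struct toy_dbuf db = { .db_size = 4096, .user_size = 512 };

    toy_cache_insert(&db);
    toy_evict_user(&db);    /* user data detached while still cached */
    toy_evict_dbuf(&db);
    assert(cache_bytes == 0);    /* nothing double-counted or leaked */
    return (0);
}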
*/ boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db) { if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || (db->db_objset->os_secondary_cache == ZFS_CACHE_METADATA && dbuf_is_metadata(db))) { if (l2arc_exclude_special == 0) return (B_TRUE); blkptr_t *bp = db->db_blkptr; if (bp == NULL || BP_IS_HOLE(bp)) return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (vd == NULL) return (B_TRUE); if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) return (B_TRUE); } return (B_FALSE); } static inline boolean_t dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) { if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA && (level > 0 || DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) { if (l2arc_exclude_special == 0) return (B_TRUE); if (bp == NULL || BP_IS_HOLE(bp)) return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (vd == NULL) return (B_TRUE); if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) return (B_TRUE); } return (B_FALSE); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here, is the hash value for a given * dmu_buf_impl_t will remain constant throughout it's lifetime * (i.e. it's objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } /* * The target size of the dbuf cache can grow with the ARC target, * unless limited by the tunable dbuf_cache_max_bytes. */ static inline unsigned long dbuf_cache_target_bytes(void) { return (MIN(dbuf_cache_max_bytes, arc_target_bytes() >> dbuf_cache_shift)); } /* * The target size of the dbuf metadata cache can grow with the ARC target, * unless limited by the tunable dbuf_metadata_cache_max_bytes. 
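dbuf_cache_target_bytes() above scales the dbuf cache with the ARC target: the default shift of 5 means 1/32 of the ARC target, clamped by dbuf_cache_max_bytes. A quick illustration with assumed inputs:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)    ((a) < (b) ? (a) : (b))

/* Same shape as dbuf_cache_target_bytes(), with explicit parameters. */
static uint64_t
cache_target(uint64_t max_bytes, uint64_t arc_target, unsigned shift)
{
    return (MIN(max_bytes, arc_target >> shift));
}

int
main(void)
{
    /*
     * Assumed inputs: an 8 GiB ARC target, the default shift of 5
     * (1/32), and an effectively unlimited dbuf_cache_max_bytes.
     */
    uint64_t t = cache_target(UINT64_MAX, 8ULL << 30, 5);

    printf("%llu MiB\n", (unsigned long long)(t >> 20));    /* 256 MiB */
    return (0);
}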
*/ static inline unsigned long dbuf_metadata_cache_target_bytes(void) { return (MIN(dbuf_metadata_cache_max_bytes, arc_target_bytes() >> dbuf_metadata_cache_shift)); } static inline uint64_t dbuf_cache_hiwater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target + (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); } static inline uint64_t dbuf_cache_lowater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); } static inline boolean_t dbuf_cache_above_lowater(void) { return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_lowater_bytes()); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + &dbuf_caches[DB_DBUF_CACHE].size, size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); } } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached it's maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ static __attribute__((noreturn)) void dbuf_evict_thread(void *unused) { (void) unused; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the caller's context. 
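dbuf_cache_hiwater_bytes() and dbuf_cache_lowater_bytes() above derive the operating band from the target: with the 10% defaults the eviction thread stops at 90% of the target and callers begin evicting inline above 110%. A small sketch of the arithmetic, assuming a hypothetical 128 MiB target:

#include <stdint.h>
#include <stdio.h>

static uint64_t
hiwater(uint64_t target, unsigned pct)
{
    return (target + target * pct / 100);
}

static uint64_t
lowater(uint64_t target, unsigned pct)
{
    return (target - target * pct / 100);
}

int
main(void)
{
    uint64_t target = 128ULL << 20;    /* assumed 128 MiB target */

    /*
     * With the 10% defaults: evict down to roughly 115 MiB (90%),
     * evict inline above roughly 141 MiB (110%).
     */
    printf("low=%llu high=%llu\n",
        (unsigned long long)lowater(target, 10),
        (unsigned long long)hiwater(target, 10));
    return (0);
}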
*/ static void dbuf_evict_notify(uint64_t size) { /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { if (size > dbuf_cache_hiwater_bytes()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } } static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; dbuf_hash_table_t *h = &dbuf_hash_table; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); ds->cache_count.value.ui64 = wmsum_value(&dbuf_sums.cache_count); ds->cache_size_bytes.value.ui64 = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); ds->cache_total_evicts.value.ui64 = wmsum_value(&dbuf_sums.cache_total_evicts); for (int i = 0; i < DN_MAX_LEVELS; i++) { ds->cache_levels[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels[i]); ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = wmsum_value(&dbuf_sums.hash_misses); ds->hash_collisions.value.ui64 = wmsum_value(&dbuf_sums.hash_collisions); ds->hash_chains.value.ui64 = wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = wmsum_value(&dbuf_sums.hash_insert_race); ds->hash_table_count.value.ui64 = h->hash_table_mask + 1; ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1; ds->metadata_cache_count.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_count); ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size); ds->metadata_cache_overflow.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_overflow); return (0); } void dbuf_init(void) { uint64_t hmsize, hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; /* * The hash table is big enough to fill one eighth of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; h->hash_table = NULL; while (h->hash_table == NULL) { h->hash_table_mask = hsize - 1; h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); if (h->hash_table == NULL) hsize >>= 1; ASSERT3U(hsize, >=, 1ULL << 10); } /* * The hash table buckets are protected by an array of mutexes where * each mutex is reponsible for protecting 128 buckets. A minimum * array size of 8192 is targeted to avoid contention. */ if (dbuf_mutex_cache_shift == 0) hmsize = MAX(hsize >> 7, 1ULL << 13); else hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24); h->hash_mutexes = NULL; while (h->hash_mutexes == NULL) { h->hash_mutex_mask = hmsize - 1; h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t), KM_SLEEP); if (h->hash_mutexes == NULL) hmsize >>= 1; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
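dbuf_init() above only resizes the hash table in powers of two (doubling while sizing it, halving on allocation failure), which is what allows a bucket to be chosen with hash & hash_table_mask, where the mask is size - 1; by default the mutex array is then one lock per 128 buckets with an 8192 floor. A sketch of both computations with assumed inputs (8 KiB average block size, 16 GiB of memory):

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b)    ((a) > (b) ? (a) : (b))

int
main(void)
{
    /* Assumed inputs, not measurements from this patch. */
    uint64_t avg_blocksize = 8192;
    uint64_t all_memory = 16ULL << 30;
    uint64_t hsize = 1ULL << 16;

    /*
     * Grow by doubling, as dbuf_init() does, so hsize stays a power of
     * two and (hsize - 1) is a valid bucket mask.
     */
    while (hsize * avg_blocksize < all_memory / 8)
        hsize <<= 1;

    uint64_t mask = hsize - 1;
    uint64_t hmsize = MAX(hsize >> 7, 1ULL << 13);    /* 1 lock / 128 buckets */

    printf("buckets=%llu mask=%#llx mutexes=%llu\n",
        (unsigned long long)hsize, (unsigned long long)mask,
        (unsigned long long)hmsize);
    return (0);
}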
*/ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { multilist_create(&dbuf_caches[dcs].cache, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); zfs_refcount_create(&dbuf_caches[dcs].size); } dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0); for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); wmsum_init(&dbuf_sums.hash_chains, 0); wmsum_init(&dbuf_sums.hash_insert_race, 0); wmsum_init(&dbuf_sums.metadata_cache_count, 0); wmsum_init(&dbuf_sums.metadata_cache_overflow, 0); dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { for (int i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = KSTAT_DATA_UINT64; snprintf(dbuf_stats.cache_levels_bytes[i].name, KSTAT_STRLEN, "cache_level_%d_bytes", i); dbuf_stats.cache_levels_bytes[i].data_type = KSTAT_DATA_UINT64; } dbuf_ksp->ks_data = &dbuf_stats; dbuf_ksp->ks_update = dbuf_kstat_update; kstat_install(dbuf_ksp); } } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; dbuf_stats_destroy(); for (int i = 0; i < (h->hash_mutex_mask + 1); i++) mutex_destroy(&h->hash_mutexes[i]); vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) * sizeof (kmutex_t)); kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { zfs_refcount_destroy(&dbuf_caches[dcs].size); multilist_destroy(&dbuf_caches[dcs].cache); } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); dbuf_ksp = NULL; } wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_total_evicts); for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); wmsum_fini(&dbuf_sums.hash_chains); wmsum_fini(&dbuf_sums.hash_insert_race); wmsum_fini(&dbuf_sums.metadata_cache_count); wmsum_fini(&dbuf_sums.metadata_cache_overflow); } /* * Other stuff. 
*/ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } if ((dr = list_head(&db->db_dirty_records)) != NULL) { ASSERT(dr->dr_dbuf == db); txg_prev = dr->dr_txg; for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; dr = list_next(&db->db_dirty_records, dr)) { ASSERT(dr->dr_dbuf == db); ASSERT(txg_prev > dr->dr_txg); txg_prev = dr->dr_txg; } } /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb __maybe_unused = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. 
*/ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (int i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); } } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } static arc_buf_t * dbuf_alloc_arcbuf(dmu_buf_impl_t *db) { spa_t *spa = db->db_objset->os_spa; return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); memcpy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT)))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ const unsigned exp = dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); if (exp >= 8 * sizeof (offset)) { /* This only happens on the highest indirection level */ ASSERT3U(level, ==, dn->dn_nlevels - 1); return (0); } ASSERT3U(exp, <, 8 * sizeof (offset)); return (offset >> exp); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } /* * This function is used to lock the parent of the provided dbuf. This should be * used when modifying or reading db_blkptr. 
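dbuf_whichblock() above reduces "which level-n block covers this byte offset" to a single right shift: datablkshift plus level copies of (indblkshift - SPA_BLKPTRSHIFT). A worked example, assuming 128 KiB data and indirect blocks and 128-byte block pointers (so each indirect block holds 1024 pointers); these numbers are chosen for illustration, not taken from this patch:

#include <stdint.h>
#include <stdio.h>

#define SPA_BLKPTRSHIFT    7    /* a blkptr_t is 128 bytes */

static uint64_t
whichblock(unsigned datablkshift, unsigned indblkshift, unsigned level,
    uint64_t offset)
{
    unsigned exp = datablkshift + level * (indblkshift - SPA_BLKPTRSHIFT);

    return (offset >> exp);
}

int
main(void)
{
    /* 128 KiB blocks for both data and indirects: shifts of 17. */
    unsigned dshift = 17, ishift = 17;
    uint64_t off = 300ULL << 20;    /* byte offset 300 MiB */

    /*
     * Level 0: offset / 128 KiB = block 2400.
     * Level 1: each L1 block spans 1024 * 128 KiB = 128 MiB, so block 2.
     */
    printf("L0 blkid=%llu L1 blkid=%llu\n",
        (unsigned long long)whichblock(dshift, ishift, 0, off),
        (unsigned long long)whichblock(dshift, ishift, 1, off));
    return (0);
}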
*/ db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag) { enum db_lock_type ret = DLT_NONE; if (db->db_parent != NULL) { rw_enter(&db->db_parent->db_rwlock, rw); ret = DLT_PARENT; } else if (dmu_objset_ds(db->db_objset) != NULL) { rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, tag); ret = DLT_OBJSET; } /* * We only return a DLT_NONE lock when it's the top-most indirect block * of the meta-dnode of the MOS. */ return (ret); } /* * We need to pass the lock type in because it's possible that the block will * move from being the topmost indirect block in a dnode (and thus, have no * parent) to not the top-most via an indirection increase. This would cause a * panic if we didn't pass the lock type in. */ void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag) { if (type == DLT_PARENT) rw_exit(&db->db_parent->db_rwlock); else if (type == DLT_OBJSET) rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); } static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { (void) zb, (void) bp; dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(zfs_refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); memset(buf->b_data, 0, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "freed in flight"); } else { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "successful read"); } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); } /* * Shortcut for performing reads on bonus dbufs. Returns * an error if we fail to verify the dnode associated with * a decrypted block. Otherwise success. */ static int dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { int bonuslen, max_bonuslen, err; err = dbuf_read_verify_dnode_crypt(db, flags); if (err) return (err); bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) memset(db->db.db_data, 0, max_bonuslen); if (bonuslen) memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; int n_bps = indbs >> SPA_BLKPTRSHIFT; for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs); BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ? 
dn->dn_datablksz : BP_GET_LSIZE(dbbp)); BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); BP_SET_BIRTH(bp, dbbp->blk_birth, 0); } } /* * Handle reads on dbufs that are holes, if necessary. This function * requires that the dbuf's mutex is held. Returns success (0) if action * was taken, ENOENT if no action was taken. */ static int dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); int is_hole = bp == NULL || BP_IS_HOLE(bp); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (!is_hole && db->db_level == 0) is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp); if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); memset(db->db.db_data, 0, db->db.db_size); if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && bp->blk_birth != 0) { dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); return (0); } return (ENOENT); } /* * This function ensures that, when doing a decrypting read of a block, * we make sure we have decrypted the dnode associated with it. We must do * this so that we ensure we are fully authenticating the checksum-of-MACs * tree from the root of the objset down to this block. Indirect blocks are * always verified against their secure checksum-of-MACs assuming that the * dnode containing them is correct. Now that we are doing a decrypting read, * we can be sure that the key is loaded and verify that assumption. This is * especially important considering that we always read encrypted dnode * blocks as raw data (without verifying their MACs) to start, and * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) { int err = 0; objset_t *os = db->db_objset; arc_buf_t *dnode_abuf; dnode_t *dn; zbookmark_phys_t zb; ASSERT(MUTEX_HELD(&db->db_mtx)); if ((flags & DB_RF_NO_DECRYPT) != 0 || !os->os_encrypted || os->os_raw_receive) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { DB_DNODE_EXIT(db); return (0); } SET_BOOKMARK(&zb, dmu_objset_id(os), DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not * available. This is ok if we are only reading authenticated * (and therefore non-encrypted) blocks. */ if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_type)) || (db->db_blkid == DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; DB_DNODE_EXIT(db); return (err); } /* * Drops db_mtx and the parent lock specified by dblt and tag before * returning. 
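 *
 * A sketch of the caller's side (mirroring dbuf_read() below; not a
 * complete example):
 *
 *	mutex_enter(&db->db_mtx);
 *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 *	err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
 *
 * On return, both db_mtx and the parent lock have been dropped, whether
 * or not the call succeeded.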
*/ static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; blkptr_t bp, *bpp; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { err = dbuf_read_bonus(db, dn, flags); goto early_unlock; } if (db->db_state == DB_UNCACHED) { if (db->db_blkptr == NULL) { bpp = NULL; } else { bp = *db->db_blkptr; bpp = &bp; } } else { dbuf_dirty_record_t *dr; ASSERT3S(db->db_state, ==, DB_NOFILL); /* * Block cloning: If we have a pending block clone, * we don't want to read the underlying block, but the content * of the block being cloned, so we have the most recent data. */ dr = list_head(&db->db_dirty_records); if (dr == NULL || !dr->dt.dl.dr_brtwrite) { err = EIO; goto early_unlock; } bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; ASSERT(bpp != NULL); /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ if (BP_IS_REDACTED(bpp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); err = SET_ERROR(EIO); goto early_unlock; } SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) goto early_unlock; DB_DNODE_EXIT(db); db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); if (!DBUF_IS_CACHEABLE(db)) aflags |= ARC_FLAG_UNCACHED; else if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* * The zio layer will copy the provided blkptr later, but we have our * own copy so that we can release the parent's rwlock. We have to * do that so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ dmu_buf_unlock_parent(db, dblt, tag); (void) arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); return (err); early_unlock: DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. 
* * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT3U(dr->dr_txg, >=, txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); uint8_t complevel = arc_get_complevel(db->db_buf); if (arc_is_encrypted(db->db_buf)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(db->db_buf, &byteorder, salt, iv, mac); dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, size, arc_buf_lsize(db->db_buf), compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type, complevel); } else { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL; mutex_enter(&db->db_mtx); if (flags & DB_RF_PARTIAL_FIRST) db->db_partial_read = B_TRUE; else if (!(flags & DB_RF_PARTIAL_MORE)) db->db_partial_read = B_FALSE; if (db->db_state == DB_CACHED) { /* * Ensure that this block's dnode has been decrypted if * the caller has requested decrypted data. */ err = dbuf_read_verify_dnode_crypt(db, flags); /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it * before returning. We also call arc_untransform() on any * unauthenticated blocks, which will verify their MAC if * the key is now available. 
*/ if (err == 0 && db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { spa_t *spa = dn->dn_objset->os_spa; zbookmark_phys_t zb; SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); dbuf_fix_old_data(db, spa_syncing_txg(spa)); err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (err == 0 && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, B_FALSE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { boolean_t need_wait = B_FALSE; db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); if (zio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } err = dbuf_read_impl(db, zio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us */ if (!err && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, db->db_state != DB_CACHED, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* * If we created a zio_root we must execute it to avoid * leaking it, even if it isn't attached to any work due * to an error in dbuf_read_impl(). */ if (need_wait) { if (err == 0) err = zio_wait(zio); else VERIFY0(zio_wait(zio)); } } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* Skip the wait per the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); } } return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "assigning filled buffer"); } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. 
This call only * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); release = !dr->dt.dl.dr_brtwrite; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ if (release) arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; dbuf_dirty_record_t *dr; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid, (u_longlong_t)end_blkid); db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; db_search->db_blkid = start_blkid; db_search->db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (zfs_refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ dr = list_head(&db->db_dirty_records); if (dr != NULL) { if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. 
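 *
 * (dbuf_fix_old_data() below either gives the not-yet-synced dirty
 * record its own private copy of the data or, if there are no other
 * active holders, simply nulls out db_data.)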
*/ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); rw_enter(&db->db_rwlock, RW_WRITER); memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); kmem_free(db_search, sizeof (dmu_buf_impl_t)); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *old_buf; dbuf_dirty_record_t *dr; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ old_buf = db->db_buf; memcpy(buf->b_data, old_buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) memset((uint8_t *)buf->b_data + osize, 0, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(old_buf, db); db->db.db_size = size; dr = list_head(&db->db_dirty_records); /* dirty record added by dmu_buf_will_dirty() */ VERIFY(dr != NULL); if (db->db_level == 0) dr->dt.dl.dr_data = buf; ASSERT3U(dr->dr_txg, ==, tx->tx_txg); ASSERT3U(dr->dr_accounted, ==, osize); dr->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os __maybe_unused = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) { rw_enter(&dn->dn_struct_rwlock, RW_READER); IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); ASSERT(dn->dn_maxblkid >= blkid); dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; dr->dr_txg = tx->tx_txg; dr->dt.dll.dr_blkid = blkid; dr->dr_accounted = dn->dn_datablksz; /* * There should not be any dbuf for the block that we're dirtying. * Otherwise the buffer contents could be inconsistent between the * dbuf and the lightweight dirty record. 
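 *
 * (The lightweight record identifies its block only by dr_blkid and is
 * never attached to a dbuf, so a concurrently instantiated dbuf would
 * have no way of seeing this pending dirty data.)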
*/ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid, NULL)); mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); } if (dn->dn_nlevels == 1) { ASSERT3U(blkid, <, dn->dn_nblkptr); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); dnode_setdirty(dn, tx); } else { mutex_exit(&dn->dn_mtx); int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); rw_exit(&dn->dn_struct_rwlock); if (parent_db == NULL) { kmem_free(dr, sizeof (*dr)); return (NULL); } int err = dbuf_read(parent_db, NULL, (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err != 0) { dbuf_rele(parent_db, FTAG); kmem_free(dr, sizeof (*dr)); return (NULL); } dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); dbuf_rele(parent_db, FTAG); mutex_enter(&parent_dr->dt.di.dr_mtx); ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); list_insert_tail(&parent_dr->dt.di.dr_children, dr); mutex_exit(&parent_dr->dt.di.dr_mtx); dr->dr_parent = parent_dr; } dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); return (dr); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t *dr, *dr_next, *dr_head; int txgoff = tx->tx_txg & TXG_MASK; boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); dnode_set_dirtyctx(dn, tx, db); if (tx->tx_txg > dn->dn_dirty_txg) dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ dr_head = list_head(&db->db_dirty_records); ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); if (dr_next && dr_next->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr_next); mutex_exit(&db->db_mtx); return (dr_next); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dmu_objset_willuse_space(os, db->db.db_size, tx); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dr->dr_accounted = db->db.db_size; } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. 
*/ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_rwlock = B_TRUE; } /* * If we are overwriting a dedup BP, then unless it is snapshotted, * when we get to syncing context we will need to decrement its * refcount in the DDT. Prefetch the relevant DDT block so that * syncing context won't have to wait for the i/o. */ if (db->db_blkptr != NULL) { db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); ddt_prefetch(os->os_spa, db->db_blkptr); dmu_buf_unlock_parent(db, dblt, FTAG); } /* * We need to hold the dn_struct_rwlock to make this assertion, * because it protects dn_phys / dn_next_nlevels from changing. */ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. 
*/ if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } static void dbuf_undirty_bonus(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; if (dr->dt.dl.dr_data != db->db.db_data) { struct dnode *dn = dr->dr_dnode; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); kmem_free(dr->dt.dl.dr_data, max_bonuslen); arc_space_return(max_bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT3U(db->db_dirtycnt, >, 0); db->db_dirtycnt -= 1; } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; boolean_t brtwrite; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); brtwrite = dr->dt.dl.dr_brtwrite; if (brtwrite) { /* * We are freeing a block that we cloned in the same * transaction group. */ brt_pending_remove(dmu_objset_spa(db->db_objset), &dr->dt.dl.dr_overridden_by, tx); } dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. 
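 *
 * (Those places are: the parent dirty record's dr_children list when
 * the dbuf has an indirect parent, and dn_dirty_records[] both for
 * bonus/spill buffers and for blocks at the top level of the tree; the
 * two branches below undo exactly those insertions.)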
*/ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || brtwrite || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* * Quick check for dirtiness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr != NULL) { if (dr->dt.dl.dr_brtwrite) { /* * Block cloning: If we are dirtying a cloned * block, we cannot simply redirty it, because * this dr has no data associated with it. * We will go through a full undirtying below, * before dirtying it again. */ undirty = B_TRUE; } else { /* This dbuf is already dirty and cached. */ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); /* * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we * want to make sure dbuf_read() will read the pending cloned block and * not the uderlying block that is being replaced. dbuf_undirty() will * do dbuf_unoverride(), so we will end up with cloned block content, * without overridden BP. */ (void) dbuf_read(db, NULL, flags); if (undirty) { mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); mutex_exit(&db->db_mtx); } (void) dbuf_dirty(db, tx); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); } boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; mutex_enter(&db->db_mtx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); mutex_exit(&db->db_mtx); return (dr != NULL); } void dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; /* * Block cloning: We are going to clone into this block, so undirty * modifications done to this block so far in this txg. This includes * writes and clones into this block. 
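 *
 * (After the undirty, the buffer is handed to dmu_buf_will_not_fill()
 * below, which moves it to DB_NOFILL and dirties it again for this txg.)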
*/ mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } mutex_exit(&db->db_mtx); dmu_buf_will_not_fill(db_fake, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); mutex_exit(&db->db_mtx); dbuf_noread(db); (void) dbuf_dirty(db, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); mutex_enter(&db->db_mtx); if (db->db_state == DB_NOFILL) { /* * Block cloning: We will be completely overwriting a block * cloned in this transaction group, so let's undirty the * pending clone and mark the block as uncached. This will be * as if the clone was never done. */ VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } mutex_exit(&db->db_mtx); dbuf_noread(db); (void) dbuf_dirty(db, tx); } /* * This function is effectively the same as dmu_buf_will_dirty(), but * indicates the caller expects raw encrypted data in the db, and provides * the crypt params (byteorder, salt, iv, mac) which should be stored in the * blkptr_t when this dbuf is written. This is only used for blocks of * dnodes, during raw receive. */ void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; /* * dr_has_raw_params is only processed for blocks of dnodes * (see dbuf_sync_dnode_leaf_crypt()). */ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); ASSERT(db->db_objset->os_raw_receive); dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN); memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN); memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN); } static void dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) { struct dirty_leaf *dl; dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) { (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); old_state = db->db_state; db->db_state = DB_CACHED; if (old_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? 
*/ memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; DTRACE_SET_STATE(db, "fill done handling freed in flight"); } else { DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; dbuf_dirty_record_t *dr; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dmu_object_type_t type; ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); dmu_buf_will_not_fill(dbuf, tx); blkptr_t bp = { { { {0} } } }; BP_SET_TYPE(&bp, type); BP_SET_LEVEL(&bp, 0); BP_SET_BIRTH(&bp, tx->tx_txg, 0); BP_SET_REDACTED(&bp); BPE_SET_LSIZE(&bp, dbuf->db_size); dbuf_override_impl(db, &bp, tx); } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { /* * In practice, we will never have a case where we have an * encrypted arc buffer while additional holds exist on the * dbuf. We don't handle this here so we simply assert that * fact instead. 
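 *
 * (The copy path just below memcpy()s buf->b_data into the existing
 * dbuf, which is only meaningful for plaintext data; a raw encrypted
 * buffer would have to be handled differently.)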
*/ ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); memcpy(db->db.db_data, buf->b_data, db->db.db_size); arc_buf_destroy(buf, db); return; } if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(zfs_refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); if (db->db.db_data != NULL) { kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "buffer cleared"); } } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + ASSERT0(dmu_buf_user_size(&db->db)); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; DTRACE_SET_STATE(db, "buffer eviction started"); db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter_nested(&dn->dn_dbufs_mtx, NESTED_SINGLE); avl_remove(&dn->dn_dbufs, db); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. 
*/ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(zfs_refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) { mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or {user|group|project}used * object. */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } int nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. 
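 *
 * For reference, epbs = indblkshift - SPA_BLKPTRSHIFT, and a blkptr_t
 * is 128 bytes (SPA_BLKPTRSHIFT == 7), so today's maximum 128K (2^17)
 * indirect block gives epbs = 10, while the hypothetical 1M (2^20)
 * indirect block discussed above is where epbs reaches 13.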
*/ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; err = dbuf_hold_impl(dn, level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dbuf_node)); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_hash = hash; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "bonus buffer created"); db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before it's added to the * dn_dbufs list. 
*/ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ mutex_exit(&dn->dn_dbufs_mtx); kmem_cache_free(dbuf_kmem_cache, db); DBUF_STAT_BUMP(hash_insert_race); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "regular buffer created"); db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || zfs_refcount_count(&dn->dn_holds) > 0); (void) zfs_refcount_add(&dn->dn_holds, db); dprintf_dbuf(db, "db=%p\n", db); return (db); } /* * This function returns a block pointer and information about the object, * given a dnode and a block. This is a publicly accessible version of * dbuf_findbp that only returns some information, rather than the * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock * should be locked as (at least) a reader. */ int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) { dmu_buf_impl_t *dbp = NULL; blkptr_t *bp2; int err = 0; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); if (err == 0) { ASSERT3P(bp2, !=, NULL); *bp = *bp2; if (dbp != NULL) dbuf_rele(dbp, NULL); if (datablkszsec != NULL) *datablkszsec = dn->dn_phys->dn_datablkszsec; if (indblkshift != NULL) *indblkshift = dn->dn_phys->dn_indblkshift; } return (err); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; static void dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) { if (dpa->dpa_cb != NULL) { dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level, dpa->dpa_zb.zb_blkid, io_done); } kmem_free(dpa, sizeof (*dpa)); } static void dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { (void) zio, (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; if (abuf != NULL) arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); } /* * Actually issue the prefetch read for the block given. 
*/ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { ASSERT(!BP_IS_REDACTED(bp) || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | ARC_FLAG_NO_BUF; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && dpa->dpa_curlevel == 0) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); dbuf_prefetch_fini(dpa, B_TRUE); return; } ASSERT(zio == NULL || zio->io_error == 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. 
*/ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); return; } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode && dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS))); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); return; } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) iter_aflags |= ARC_FLAG_L2CACHE; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. */ int dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) goto no_issue; if (level == 0 && dnode_block_freed(dn, blkid)) goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, NULL); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ goto no_issue; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. 
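The blkid arithmetic above is the core of the indirect-block walk: with epbs block pointers per indirect block (expressed as a shift), the ancestor of a block at a higher level is its block id shifted right by epbs for every level climbed, and the low epbs bits select the slot inside that ancestor. For example, with epbs = 10 (1024 block pointers per indirect block), level-0 blkid 1050000 lives under level-1 blkid 1025 at slot 400. A stand-alone restatement for illustration only; these helpers are not part of the change.

/* Block id of the ancestor indirect block at 'curlevel' covering 'blkid'. */
static uint64_t
example_ancestor_blkid(uint64_t blkid, int level, int curlevel, int epbs)
{
	return (blkid >> (epbs * (curlevel - level)));
}

/* Index of a child's blkptr_t within its parent indirect block. */
static uint64_t
example_parent_slot(uint64_t child_blkid, int epbs)
{
	return (P2PHASE(child_blkid, 1ULL << epbs));
}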
*/ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } ASSERT(!BP_IS_REDACTED(&bp) || dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; dpa->dpa_cb = cb; dpa->dpa_arg = arg; if (!DNODE_LEVEL_IS_CACHEABLE(dn, level)) dpa->dpa_aflags |= ARC_FLAG_UNCACHED; else if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); return (1); no_issue: if (cb != NULL) cb(arg, level, blkid, B_FALSE); return (0); } int dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* * Helper function for dbuf_hold_impl() to copy a buffer. Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). 
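As a usage sketch for dbuf_prefetch_impl(): a caller that wants completion notification passes a dbuf_prefetch_fn and an argument, and the callback's io_done parameter says whether a read was actually issued. The helper names below and the zero aflags are illustrative, and the callback prototype is inferred from the invocations in this file.

static void
example_prefetch_done(void *arg, uint64_t level, uint64_t blkid,
    boolean_t io_done)
{
	(void) level, (void) blkid;
	if (io_done)
		atomic_inc_64((uint64_t *)arg);	/* count completed reads */
}

static void
example_prefetch(dnode_t *dn, uint64_t blkid, uint64_t *io_count)
{
	/* dn_struct_rwlock must be held, per the ASSERT above. */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	(void) dbuf_prefetch_impl(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ,
	    0, example_prefetch_done, io_count);
	rw_exit(&dn->dn_struct_rwlock);
}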
*/ noinline static void dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *data = dr->dt.dl.dr_data; enum zio_compress compress_type = arc_get_compression(data); uint8_t complevel = arc_get_complevel(data); if (arc_is_encrypted(data)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(data, &byteorder, salt, iv, mac); dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, arc_buf_size(data), arc_buf_lsize(data), compress_type, complevel)); } else if (compress_type != ZIO_COMPRESS_OFF) { dbuf_set_data(db, arc_alloc_compressed_buf( dn->dn_objset->os_spa, db, arc_buf_size(data), arc_buf_lsize(data), compress_type, complevel)); } else { dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } rw_enter(&db->db_rwlock, RW_WRITER); memcpy(db->db.db_data, data->b_data, arc_buf_size(data)); rw_exit(&db->db_rwlock); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, const void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; uint64_t hv; /* If the pool has been created, verify the tx_sync_lock is not held */ spa_t *spa = dn->dn_objset->os_spa; dsl_pool_t *dp = spa->spa_dsl_pool; if (dp != NULL) { ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); } ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv); if (db == NULL) { blkptr_t *bp = NULL; int err; if (fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp, hv); } if (fail_uncached && db->db_state != DB_CACHED) { mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } if (db->db_buf != NULL) { arc_buf_access(db->db_buf); ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. 
*/ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) { ASSERT3P(db->db_buf, !=, NULL); dbuf_hold_copy(dn, db); } } if (multilist_link_active(&db->db_cache_link)) { ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[db->db_caching_status].size, size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); } db->db_caching_status = DB_NO_CACHE; } (void) zfs_refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); dbuf_new_size(db, blksz, tx); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, const void *tag) { int64_t holds = zfs_refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid, NULL); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) zfs_refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&found_db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 
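For context on how the hold interfaces above are normally consumed, here is a minimal (hypothetical) hold/read/release sequence; dbuf_hold_impl() returns with db_holds bumped and db_mtx dropped, so the matching dbuf_rele() is the only cleanup required.

static int
example_hold_and_read(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db;
	int err;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	err = dbuf_hold_impl(dn, 0, blkid, FALSE, FALSE, FTAG, &db);
	rw_exit(&dn->dn_struct_rwlock);
	if (err != 0)
		return (err);

	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
	dbuf_rele(db, FTAG);
	return (err);
}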
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, const void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void dmu_buf_rele(dmu_buf_t *db, const void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. The 'evicting' * argument should be set if we are already in the dbuf-evicting code * path, in which case we don't want to recursively evict. This allows us to * avoid deeply nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) { int64_t holds; uint64_t size; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = zfs_refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ? 
DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size; + uint64_t db_size = db->db.db_size + + dmu_buf_user_size(&db->db); size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); uint8_t db_level = db->db_level; mutex_exit(&db->db_mtx); if (dcs == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMP(metadata_cache_count); DBUF_STAT_MAX(metadata_cache_size_bytes_max, size); } else { DBUF_STAT_BUMP(cache_count); DBUF_STAT_MAX(cache_size_bytes_max, size); DBUF_STAT_BUMP(cache_levels[db_level]); DBUF_STAT_INCR(cache_levels_bytes[db_level], db_size); } if (dcs == DB_DBUF_CACHE && !evicting) dbuf_evict_notify(size); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (zfs_refcount_count(&db->db_holds)); } uint64_t dmu_buf_user_refcount(dmu_buf_t *db_fake) { uint64_t holds; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt); holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt; mutex_exit(&db->db_mtx); return (holds); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } +uint64_t +dmu_buf_user_size(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_user == NULL) + return (0); + return (atomic_load_64(&db->db_user->dbu_size)); +} + +void +dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd); + atomic_add_64(&db->db_user->dbu_size, nadd); +} + +void +dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub); + atomic_sub_64(&db->db_user->dbu_size, nsub); +} + void dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = 
(dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; void *data = dr->dt.dl.dr_data; ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(data != NULL); dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys)); dbuf_sync_leaf_verify_bonus_dnode(dr); dbuf_undirty_bonus(dr); dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } /* * When syncing out a blocks of dnodes, adjust the block to deal with * encryption. Normally, we make sure the block is decrypted before writing * it. If we have crypt params, then we are writing a raw (encrypted) block, * from a raw receive. In this case, set the ARC buf's crypt params so * that the BP will be filled with the correct byteorder, salt, iv, and mac. */ static void dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) { int err; dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { zbookmark_phys_t zb; /* * Unfortunately, there is currently no mechanism for * syncing context to handle decryption errors. An error * here is only possible if an attacker maliciously * changed a dnode block and updated the associated * checksums going up the block tree. */ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); err = arc_untransform(db->db_buf, db->db_objset->os_spa, &zb, B_TRUE); if (err) panic("Invalid dnode block MAC"); } else if (dr->dt.dl.dr_has_raw_params) { (void) arc_release(dr->dt.dl.dr_data, db); arc_convert_to_raw(dr->dt.dl.dr_data, dmu_objset_id(db->db_objset), dr->dt.dl.dr_byteorder, DMU_OT_DNODE, dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac); } } /* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. 
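The dmu_buf_user_size()/dmu_buf_add_user_size()/dmu_buf_sub_user_size() interfaces added above let a dbuf user report auxiliary in-core memory so that the dbuf-cache accounting earlier in this change (db_size plus user size in dbuf_hold_impl() and dbuf_rele_and_unlock()) reflects it. A hedged sketch of a consumer follows; every name except the dmu_buf_*_user_size() calls is hypothetical, the embedded dmu_buf_user_t is assumed to be initialized with dmu_buf_init_user() elsewhere, and the size may only be adjusted while the dbuf is held (DB_NO_CACHE), as the ASSERTs above require.

typedef struct example_user {
	dmu_buf_user_t	eu_dbu;		/* initialized elsewhere */
	void		*eu_decoded;	/* auxiliary allocation to charge */
	size_t		eu_decoded_len;
} example_user_t;

static void
example_attach(dmu_buf_t *db, example_user_t *eu)
{
	VERIFY3P(dmu_buf_set_user(db, &eu->eu_dbu), ==, NULL);
	/* Charge the decoded copy against the dbuf's cached footprint. */
	dmu_buf_add_user_size(db, eu->eu_decoded_len);
}

static void
example_resize(dmu_buf_t *db, example_user_t *eu, size_t newlen)
{
	if (newlen > eu->eu_decoded_len)
		dmu_buf_add_user_size(db, newlen - eu->eu_decoded_len);
	else if (newlen < eu->eu_decoded_len)
		dmu_buf_sub_user_size(db, eu->eu_decoded_len - newlen);
	eu->eu_decoded_len = newlen;
}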
*/ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio_t *zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* * Verify that the size of the data in our bonus buffer does not exceed * its recorded size. * * The purpose of this verification is to catch any cases in development * where the size of a phys structure (i.e., space_map_phys_t) grows and, * due to incorrect feature management, older pools expect to read more * data even though they didn't actually write it to begin with. * * For example, this would catch an error in the feature logic where we * open an older pool and we expect to write the space map histogram of * a space map with size SPACE_MAP_SIZE_V0. */ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG dnode_t *dn = dr->dr_dnode; /* * Encrypted bonus buffers can have data past their bonuslen. * Skip the verification of these blocks. */ if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)) return; uint16_t bonuslen = dn->dn_phys->dn_bonuslen; uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, maxbonuslen); arc_buf_t *datap = dr->dt.dl.dr_data; char *datap_end = ((char *)datap) + bonuslen; char *datap_max = ((char *)datap) + maxbonuslen; /* ensure that everything is zero after our data */ for (; datap_end < datap_max; datap_end++) ASSERT(*datap_end == 0); #endif } static blkptr_t * dbuf_lightweight_bp(dbuf_dirty_record_t *dr) { /* This must be a lightweight dirty record.
*/ ASSERT3P(dr->dr_dbuf, ==, NULL); dnode_t *dn = dr->dr_dnode; if (dn->dn_phys->dn_nlevels == 1) { VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); } else { dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; VERIFY3U(parent_db->db_level, ==, 1); VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); blkptr_t *bp = parent_db->db.db_data; return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); } } static void dbuf_lightweight_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; blkptr_t *bp = zio->io_bp; if (zio->io_error != 0) return; dnode_t *dn = dr->dr_dnode; blkptr_t *bp_orig = dbuf_lightweight_bp(dr); spa_t *spa = dmu_objset_spa(dn->dn_objset); int64_t delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta); uint64_t blkid = dr->dt.dll.dr_blkid; mutex_enter(&dn->dn_mtx); if (blkid > dn->dn_phys->dn_maxblkid) { ASSERT0(dn->dn_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = blkid; } mutex_exit(&dn->dn_mtx); if (!BP_IS_EMBEDDED(bp)) { uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1; BP_SET_FILL(bp, fill); } dmu_buf_impl_t *parent_db; EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); if (dr->dr_parent == NULL) { parent_db = dn->dn_dbuf; } else { parent_db = dr->dr_parent->dr_dbuf; } rw_enter(&parent_db->db_rwlock, RW_WRITER); *bp_orig = *bp; rw_exit(&parent_db->db_rwlock); } static void dbuf_lightweight_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; VERIFY0(zio->io_error); objset_t *os = dr->dr_dnode->dn_objset; dmu_tx_t *tx = os->os_synctx; if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, zio->io_bp, tx); } dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); } noinline static void dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dnode_t *dn = dr->dr_dnode; zio_t *pio; if (dn->dn_phys->dn_nlevels == 1) { pio = dn->dn_zio; } else { pio = dr->dr_parent->dr_zio; } zbookmark_phys_t zb = { .zb_objset = dmu_objset_id(dn->dn_objset), .zb_object = dn->dn_object, .zb_level = 0, .zb_blkid = dr->dt.dll.dr_blkid, }; /* * See comment in dbuf_write(). This is so that zio->io_bp_orig * will have the old BP in dbuf_lightweight_done(). */ dr->dr_bp_copy = *dbuf_lightweight_bp(dr); dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); } /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. 
*/ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else if (db->db_state == DB_READ) { /* * This buffer has a clone we need to write, and an in-flight * read on the BP we're about to clone. Its safe to issue the * write here because the read has already been issued and the * contents won't change. */ ASSERT(dr->dt.dl.dr_brtwrite && dr->dt.dl.dr_override_state == DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { /* * In the previous transaction group, the bonus buffer * was entirely used to store the attributes for the * dnode which overrode the dn_spill field. However, * when adding more attributes to the file a spill * block was required to hold the extra attributes. * * Make sure to clear the garbage left in the dn_spill * field from the previous attributes in the bonus * buffer. Otherwise, after writing out the spill * block to the new allocated dva, it will free * the old block pointed to by the invalid dn_spill. */ db->db_blkptr = NULL; } dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dr->dr_dbuf == db); dbuf_sync_bonus(dr, tx); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); } /* * If this is a dnode block, ensure it is appropriately encrypted * or decrypted, depending on what we are writing to it this txg. */ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). 
*/ int psize = arc_buf_size(*datap); int lsize = arc_buf_lsize(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); uint8_t complevel = arc_get_complevel(*datap); if (arc_is_encrypted(*datap)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(*datap, &byteorder, salt, iv, mac); *datap = arc_alloc_raw_buf(os->os_spa, db, dmu_objset_id(os), byteorder, salt, iv, mac, dn->dn_type, psize, lsize, compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type, complevel); } else { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } memcpy((*datap)->b_data, db->db.db_data, psize); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); } else { zio_nowait(dr->dr_zio); } } /* * Syncs out a range of dirty records for indirect or leaf dbufs. May be * called recursively from dbuf_sync_indirect(). */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } list_remove(list, dr); if (dr->dr_dbuf == NULL) { dbuf_sync_lightweight(dr, tx); } else { if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } } static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) { ASSERT0(db->db_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = db->db_blkid; } mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { dnode_phys_t *dnp = (void *)(((char *)db->db.db_data) + i); i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; for (int j = 0; j < dnp->dn_nblkptr; j++) { (void) 
zfs_blkptr_verify(spa, &dnp->dn_blkptr[j], BLK_CONFIG_SKIP, BLK_VERIFY_HALT); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { (void) zfs_blkptr_verify(spa, DN_SPILL_BLKPTR(dnp), BLK_CONFIG_SKIP, BLK_VERIFY_HALT); } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; (void) zfs_blkptr_verify(spa, ibp, BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) BP_SET_FILL(bp, fill); mutex_exit(&db->db_mtx); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; dmu_buf_unlock_parent(db, dblt, FTAG); } /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to before we try to compress it. */ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1ULL << epbs) { /* * We only found holes. Grab the rwlock to prevent * anybody from reading the blocks we're about to * zero out. */ rw_enter(&db->db_rwlock, RW_WRITER); memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) buf; dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting.
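The hole-collapsing logic in dbuf_write_children_ready() reduces to a single predicate over the indirect block's pointer array. Restated on its own for illustration only; this helper is not part of the change.

static boolean_t
example_all_holes(const blkptr_t *bps, unsigned int epbs)
{
	for (unsigned int i = 0; i < (1U << epbs); i++) {
		if (!BP_IS_HOLE(&bps[i]))
			return (B_FALSE);
	}
	return (B_TRUE);
}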
*/ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); dbuf_dirty_record_t *dr = db->db_data_pending; dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != NULL && dr->dt.dl.dr_data != db->db_buf) { arc_buf_destroy(dr->dt.dl.dr_data, db); } } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs __maybe_unused = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_free(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { objset_t *drica_os; uint64_t drica_blk_birth; dmu_tx_t *drica_tx; } dbuf_remap_impl_callback_arg_t; static void dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, void *arg) { dbuf_remap_impl_callback_arg_t *drica = arg; objset_t *os = drica->drica_os; spa_t *spa = dmu_objset_spa(os); dmu_tx_t *tx = drica->drica_tx; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (os == spa_meta_objset(spa)) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, size, drica->drica_blk_birth, tx); } } static void dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); dbuf_remap_impl_callback_arg_t drica; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; drica.drica_blk_birth = bp->blk_birth; drica.drica_tx = tx; if 
(spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* * If the blkptr being remapped is tracked by a livelist, * then we need to make sure the livelist reflects the update. * First, cancel out the old blkptr by appending a 'FREE' * entry. Next, add an 'ALLOC' to track the new version. This * way we avoid trying to free an inaccurate blkptr at delete. * Note that embedded blkptrs are not tracked in livelists. */ if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_frees, bp); bplist_append(&ds->ds_dir->dd_pending_allocs, &bp_copy); } } /* * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ if (rw != NULL) rw_enter(rw, RW_WRITER); *bp = bp_copy; if (rw != NULL) rw_exit(rw); } } /* * Remap any existing BP's to concrete vdevs, if possible. */ static void dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(db->db_objset); ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) return; if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : &dn->dn_dbuf->db_rwlock); dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, tx); } } } } /* * Populate dr->dr_zio with a zio to commit a dirty buffer to disk. * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio). */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *pio; /* parent I/O */ int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } dbuf_remap(dn, db, tx); } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); pio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. 
*/ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); pio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. 
*/ arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); EXPORT_SYMBOL(dbuf_unoverride); EXPORT_SYMBOL(dbuf_free_range); EXPORT_SYMBOL(dbuf_new_size); EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); EXPORT_SYMBOL(dbuf_hold_level); EXPORT_SYMBOL(dbuf_create_bonus); EXPORT_SYMBOL(dbuf_spill_set_blksz); EXPORT_SYMBOL(dbuf_rm_spill); EXPORT_SYMBOL(dbuf_add_ref); EXPORT_SYMBOL(dbuf_rele); EXPORT_SYMBOL(dbuf_rele_and_unlock); EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, "Percentage over dbuf_cache_max_bytes for direct dbuf eviction."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, "Percentage below dbuf_cache_max_bytes when dbuf eviction stops."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW, "Maximum size in bytes of dbuf metadata cache."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW, "Set size of dbuf cache to log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, "Set size of dbuf metadata cache to log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, "Set size of dbuf cache mutex array as log2 shift."); diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c index e5dc2df30774..ccee8997e10e 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c +++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c @@ -1,230 +1,231 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
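For readers tuning the dbuf cache parameters registered at the end of dbuf.c above: going by the parameter descriptions, direct eviction kicks in a fixed percentage above the cache target and the eviction thread stops a fixed percentage below it. A simplified model with hypothetical helper names, assuming the target has already been resolved from dbuf_cache_max_bytes or the ARC-derived shift:

static uint64_t
example_hiwater_bytes(uint64_t target_bytes, uint_t hiwater_pct)
{
	/* Above this, callers evict directly. */
	return (target_bytes + (target_bytes * hiwater_pct) / 100);
}

static uint64_t
example_lowater_bytes(uint64_t target_bytes, uint_t lowater_pct)
{
	/* The evict thread stops once the cache drops below this. */
	return (target_bytes - (target_bytes * lowater_pct) / 100);
}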
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include /* * Calculate the index of the arc header for the state, disabled by default. */ int zfs_dbuf_state_index = 0; /* * ========================================================================== * Dbuf Hash Read Routines * ========================================================================== */ typedef struct dbuf_stats_t { kmutex_t lock; kstat_t *kstat; dbuf_hash_table_t *hash; int idx; } dbuf_stats_t; static dbuf_stats_t dbuf_stats_hash_table; static int dbuf_stats_hash_table_headers(char *buf, size_t size) { (void) snprintf(buf, size, - "%-96s | %-119s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | " + "%-105s | %-119s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | " "%-5s %-5s %-9s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc", - "list", "atype", "flags", "count", "asize", "access", + "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); return (0); } static int __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) { arc_buf_info_t abi = { 0 }; dmu_object_info_t doi = { 0 }; dnode_t *dn = DB_DNODE(db); size_t nwritten; if (db->db_buf) arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, - "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d " - "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " + "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu " + "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ spa_name(dn->dn_objset->os_spa), (u_longlong_t)dmu_objset_id(db->db_objset), (longlong_t)db->db.db_object, (longlong_t)db->db_level, (longlong_t)db->db_blkid, (u_longlong_t)db->db.db_offset, (u_longlong_t)db->db.db_size, + (u_longlong_t)dmu_buf_user_size(&db->db), !!dbuf_is_metadata(db), db->db_state, (ulong_t)zfs_refcount_count(&db->db_holds), multilist_link_active(&db->db_cache_link), /* arc_buf_info_t */ abi.abi_state_type, abi.abi_state_contents, abi.abi_flags, (ulong_t)abi.abi_bufcnt, (u_longlong_t)abi.abi_size, (u_longlong_t)abi.abi_access, (ulong_t)abi.abi_mru_hits, (ulong_t)abi.abi_mru_ghost_hits, (ulong_t)abi.abi_mfu_hits, (ulong_t)abi.abi_mfu_ghost_hits, (ulong_t)abi.abi_l2arc_hits, (u_longlong_t)abi.abi_l2arc_dattr, (u_longlong_t)abi.abi_l2arc_asize, abi.abi_l2arc_compress, (ulong_t)abi.abi_holds, /* dmu_object_info_t */ doi.doi_type, doi.doi_bonus_type, (ulong_t)doi.doi_data_block_size, (ulong_t)doi.doi_metadata_block_size, (u_longlong_t)doi.doi_bonus_size, (ulong_t)doi.doi_indirection, (ulong_t)zfs_refcount_count(&dn->dn_holds), (u_longlong_t)doi.doi_fill_count, (u_longlong_t)doi.doi_max_offset); if (nwritten >= size) return (size); return (nwritten + 1); } static int 
dbuf_stats_hash_table_data(char *buf, size_t size, void *data) { dbuf_stats_t *dsh = (dbuf_stats_t *)data; dbuf_hash_table_t *h = dsh->hash; dmu_buf_impl_t *db; int length, error = 0; ASSERT3S(dsh->idx, >=, 0); ASSERT3S(dsh->idx, <=, h->hash_table_mask); if (size) buf[0] = 0; mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { /* * Returning ENOMEM will cause the data and header functions * to be called with a larger scratch buffers. */ if (size < 512) { error = SET_ERROR(ENOMEM); break; } mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { length = __dbuf_stats_hash_table_data(buf, size, db); buf += length; size -= length; } mutex_exit(&db->db_mtx); } mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); return (error); } static void * dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) { dbuf_stats_t *dsh = ksp->ks_private; ASSERT(MUTEX_HELD(&dsh->lock)); if (n <= dsh->hash->hash_table_mask) { dsh->idx = n; return (dsh); } return (NULL); } static void dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) { dbuf_stats_t *dsh = &dbuf_stats_hash_table; kstat_t *ksp; mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); dsh->hash = hash; ksp = kstat_create("zfs", 0, "dbufs", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); dsh->kstat = ksp; if (ksp) { ksp->ks_lock = &dsh->lock; ksp->ks_ndata = UINT32_MAX; ksp->ks_private = dsh; kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); kstat_install(ksp); } } static void dbuf_stats_hash_table_destroy(void) { dbuf_stats_t *dsh = &dbuf_stats_hash_table; kstat_t *ksp; ksp = dsh->kstat; if (ksp) kstat_delete(ksp); mutex_destroy(&dsh->lock); } void dbuf_stats_init(dbuf_hash_table_t *hash) { dbuf_stats_hash_table_init(hash); } void dbuf_stats_destroy(void) { dbuf_stats_hash_table_destroy(); } ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW, "Calculate arc header index"); diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 79fd02dcb9aa..029d9df8af89 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -1,2706 +1,2719 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include dnode_stats_t dnode_stats = { { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_allocate", KSTAT_DATA_UINT64 }, { "dnode_reallocate", KSTAT_DATA_UINT64 }, { "dnode_buf_evict", KSTAT_DATA_UINT64 }, { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, { "dnode_alloc_race", KSTAT_DATA_UINT64 }, { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, { "dnode_move_invalid", KSTAT_DATA_UINT64 }, { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, { "dnode_move_special", KSTAT_DATA_UINT64 }, { "dnode_move_handle", KSTAT_DATA_UINT64 }, { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, { "dnode_move_active", KSTAT_DATA_UINT64 }, }; dnode_sums_t dnode_sums; static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; static dnode_phys_t dnode_phys_zero __maybe_unused; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; #ifdef _KERNEL static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ static int dbuf_compare(const void *x1, const void *x2) { const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; int cmp = TREE_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); cmp = TREE_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); } else if (d2->db_state == DB_SEARCH) { ASSERT3S(d1->db_state, !=, DB_SEARCH); return (1); } return (TREE_PCMP(d1, d2)); } static int dnode_cons(void *arg, void *unused, int kmflag) { (void) unused, (void) kmflag; dnode_t *dn = arg; rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is * O(number of references), so don't track dn_holds. 
*/ zfs_refcount_create_untracked(&dn->dn_holds); zfs_refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type)); memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr)); memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels)); memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift)); memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype)); memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk)); memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen)); memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz)); memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid)); for (int i = 0; i < TXG_SIZE; i++) { multilist_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_oldprojid = ZFS_DEFAULT_PROJID; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_newprojid = ZFS_DEFAULT_PROJID; dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; return (0); } static void dnode_dest(void *arg, void *unused) { (void) unused; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); cv_destroy(&dn->dn_nodnholds); zfs_refcount_destroy(&dn->dn_holds); zfs_refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT0(dn->dn_next_maxblkid[i]); } ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); ASSERT0(dn->dn_dirty_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); ASSERT(!dn->dn_have_spill); ASSERT3P(dn->dn_zio, ==, NULL); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); ASSERT0(dn->dn_oldgid); ASSERT0(dn->dn_oldprojid); ASSERT0(dn->dn_newuid); ASSERT0(dn->dn_newgid); ASSERT0(dn->dn_newprojid); ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); avl_destroy(&dn->dn_dbufs); } static int dnode_kstats_update(kstat_t *ksp, int rw) { dnode_stats_t *ds = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); ds->dnode_hold_dbuf_hold.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_dbuf_hold); ds->dnode_hold_dbuf_read.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_dbuf_read); ds->dnode_hold_alloc_hits.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_alloc_hits); ds->dnode_hold_alloc_misses.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_alloc_misses); ds->dnode_hold_alloc_interior.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_alloc_interior); ds->dnode_hold_alloc_lock_retry.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry); 
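/*
 * (Aside, not part of the change: dnode_kstats_update() only snapshots the
 * write-mostly sums into the named kstat values and rejects writes.  A
 * minimal userland sketch of that shape; sum_t/sum_value() and the other
 * names here are illustrative stand-ins, not the kernel wmsum/kstat API.)
 */
#if 0
#include <errno.h>
#include <stdint.h>

typedef struct { uint64_t s; } sum_t;	/* stand-in for a wmsum-style counter */

static uint64_t
sum_value(const sum_t *sp)
{
	return (sp->s);
}

typedef struct {
	uint64_t hold_hits;
	uint64_t hold_misses;
} named_stats_t;

static sum_t hold_hits_sum, hold_misses_sum;

static int
stats_update(named_stats_t *ns, int writing)
{
	if (writing)
		return (EACCES);	/* read-only, like the KSTAT_WRITE check above */
	ns->hold_hits = sum_value(&hold_hits_sum);
	ns->hold_misses = sum_value(&hold_misses_sum);
	return (0);
}
#endif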
ds->dnode_hold_alloc_lock_misses.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses); ds->dnode_hold_alloc_type_none.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_alloc_type_none); ds->dnode_hold_free_hits.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_free_hits); ds->dnode_hold_free_misses.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_free_misses); ds->dnode_hold_free_lock_misses.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_free_lock_misses); ds->dnode_hold_free_lock_retry.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_free_lock_retry); ds->dnode_hold_free_refcount.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_free_refcount); ds->dnode_hold_free_overflow.value.ui64 = wmsum_value(&dnode_sums.dnode_hold_free_overflow); ds->dnode_free_interior_lock_retry.value.ui64 = wmsum_value(&dnode_sums.dnode_free_interior_lock_retry); ds->dnode_allocate.value.ui64 = wmsum_value(&dnode_sums.dnode_allocate); ds->dnode_reallocate.value.ui64 = wmsum_value(&dnode_sums.dnode_reallocate); ds->dnode_buf_evict.value.ui64 = wmsum_value(&dnode_sums.dnode_buf_evict); ds->dnode_alloc_next_chunk.value.ui64 = wmsum_value(&dnode_sums.dnode_alloc_next_chunk); ds->dnode_alloc_race.value.ui64 = wmsum_value(&dnode_sums.dnode_alloc_race); ds->dnode_alloc_next_block.value.ui64 = wmsum_value(&dnode_sums.dnode_alloc_next_block); ds->dnode_move_invalid.value.ui64 = wmsum_value(&dnode_sums.dnode_move_invalid); ds->dnode_move_recheck1.value.ui64 = wmsum_value(&dnode_sums.dnode_move_recheck1); ds->dnode_move_recheck2.value.ui64 = wmsum_value(&dnode_sums.dnode_move_recheck2); ds->dnode_move_special.value.ui64 = wmsum_value(&dnode_sums.dnode_move_special); ds->dnode_move_handle.value.ui64 = wmsum_value(&dnode_sums.dnode_move_handle); ds->dnode_move_rwlock.value.ui64 = wmsum_value(&dnode_sums.dnode_move_rwlock); ds->dnode_move_active.value.ui64 = wmsum_value(&dnode_sums.dnode_move_active); return (0); } void dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0); wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0); wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0); wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0); wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0); wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0); wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0); wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0); wmsum_init(&dnode_sums.dnode_hold_free_hits, 0); wmsum_init(&dnode_sums.dnode_hold_free_misses, 0); wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0); wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0); wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0); wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0); wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0); wmsum_init(&dnode_sums.dnode_allocate, 0); wmsum_init(&dnode_sums.dnode_reallocate, 0); wmsum_init(&dnode_sums.dnode_buf_evict, 0); wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0); wmsum_init(&dnode_sums.dnode_alloc_race, 0); wmsum_init(&dnode_sums.dnode_alloc_next_block, 0); wmsum_init(&dnode_sums.dnode_move_invalid, 0); wmsum_init(&dnode_sums.dnode_move_recheck1, 0); wmsum_init(&dnode_sums.dnode_move_recheck2, 0); wmsum_init(&dnode_sums.dnode_move_special, 0); wmsum_init(&dnode_sums.dnode_move_handle, 0); wmsum_init(&dnode_sums.dnode_move_rwlock, 0); wmsum_init(&dnode_sums.dnode_move_active, 0); dnode_ksp = 
kstat_create("zfs", 0, "dnodestats", "misc", KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dnode_ksp != NULL) { dnode_ksp->ks_data = &dnode_stats; dnode_ksp->ks_update = dnode_kstats_update; kstat_install(dnode_ksp); } } void dnode_fini(void) { if (dnode_ksp != NULL) { kstat_delete(dnode_ksp); dnode_ksp = NULL; } wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold); wmsum_fini(&dnode_sums.dnode_hold_dbuf_read); wmsum_fini(&dnode_sums.dnode_hold_alloc_hits); wmsum_fini(&dnode_sums.dnode_hold_alloc_misses); wmsum_fini(&dnode_sums.dnode_hold_alloc_interior); wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry); wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses); wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none); wmsum_fini(&dnode_sums.dnode_hold_free_hits); wmsum_fini(&dnode_sums.dnode_hold_free_misses); wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses); wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry); wmsum_fini(&dnode_sums.dnode_hold_free_refcount); wmsum_fini(&dnode_sums.dnode_hold_free_overflow); wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry); wmsum_fini(&dnode_sums.dnode_allocate); wmsum_fini(&dnode_sums.dnode_reallocate); wmsum_fini(&dnode_sums.dnode_buf_evict); wmsum_fini(&dnode_sums.dnode_alloc_next_chunk); wmsum_fini(&dnode_sums.dnode_alloc_race); wmsum_fini(&dnode_sums.dnode_alloc_next_block); wmsum_fini(&dnode_sums.dnode_move_invalid); wmsum_fini(&dnode_sums.dnode_move_recheck1); wmsum_fini(&dnode_sums.dnode_move_recheck2); wmsum_fini(&dnode_sums.dnode_move_special); wmsum_fini(&dnode_sums.dnode_move_handle); wmsum_fini(&dnode_sums.dnode_move_rwlock); wmsum_fini(&dnode_sums.dnode_move_active); kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } #ifdef ZFS_DEBUG void dnode_verify(dnode_t *dn) { int drop_struct_lock = FALSE; ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + dn->dn_bonuslen, <=, max_bonuslen); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } #endif void dnode_byteswap(dnode_phys_t *dnp) { uint64_t *buf64 = (void*)&dnp->dn_blkptr; int i; if (dnp->dn_type == DMU_OT_NONE) { memset(dnp, 0, sizeof (dnode_phys_t)); 
return; } dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); /* * dn_nblkptr is only one byte, so it's OK to read it in either * byte order. We can't read dn_bonuslen. */ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) buf64[i] = BSWAP_64(buf64[i]); /* * OK to check dn_bonuslen for zero, because it won't matter if * we have the wrong byte order. This is necessary because the * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { dmu_object_byteswap_t byteswap; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { int i = 0; ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); while (i < size) { dnode_phys_t *dnp = (void *)(((char *)vbuf) + i); dnode_byteswap(dnp); i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); if (newsize < dn->dn_bonuslen) { /* clear any data after the end of the new size */ size_t diff = dn->dn_bonuslen - newsize; char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize; memset(data_end, 0, diff); } dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; else dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; rw_exit(&dn->dn_struct_rwlock); } void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) { ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dn->dn_bonustype = newtype; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; rw_exit(&dn->dn_struct_rwlock); } void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); dnode_setdirty(dn, tx); dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK; dn->dn_have_spill = B_FALSE; } static void dnode_setdblksz(dnode_t *dn, int size) { ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate * for the dnode_move() callback.
*/ dn->dn_object = object; dn->dn_dbuf = db; dn->dn_handle = dnh; dn->dn_phys = dnp; if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } else { dn->dn_datablksz = 0; dn->dn_datablkszsec = 0; dn->dn_datablkshift = 0; } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; dn->dn_nblkptr = dnp->dn_nblkptr; dn->dn_checksum = dnp->dn_checksum; dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); mutex_enter(&os->os_lock); /* * Exclude special dnodes from os_dnodes so an empty os_dnodes * signifies that the special dnodes have no references from * their children (the entries in os_dnodes). This allows * dnode_destroy() to easily determine if the last child has * been removed and then complete eviction of the objset. */ if (!DMU_OBJECT_IS_SPECIAL(object)) list_insert_head(&os->os_dnodes, dn); membar_producer(); /* * Everything else must be valid before assigning dn_objset * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); return (dn); } /* * Caller must be holding the dnode handle, which is released upon return. */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { list_remove(&os->os_dnodes, dn); complete_os_eviction = list_is_empty(&os->os_dnodes) && list_link_active(&os->os_evicting_node); } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; dn->dn_have_spill = B_FALSE; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_oldprojid = ZFS_DEFAULT_PROJID; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_newprojid = ZFS_DEFAULT_PROJID; dn->dn_id_flags = 0; dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); if (complete_os_eviction) dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int i; ASSERT3U(dn_slots, >, 0); ASSERT3U(dn_slots << DNODE_SHIFT, <=, spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); if (ibs == 0) ibs = zfs_default_ibs; ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", dn->dn_objset, 
(u_longlong_t)dn->dn_object, (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots); DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t))); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT0(dn->dn_next_maxblkid[i]); ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; dn->dn_num_slots = dn_slots; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; else { dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> SPA_BLKPTRSHIFT)); } dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_dirty_txg = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dn_slots, boolean_t keep_spill, dmu_tx_t *tx) { int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT)); dnode_free_interior_slots(dn); DNODE_STAT_BUMP(dnode_reallocate); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { /* change blocksize */ ASSERT0(dn->dn_maxblkid); ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || dnode_block_freed(dn, 0)); dnode_setdblksz(dn, blocksize); dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize; } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen; if 
(bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else nblkptr = MIN(DN_MAX_NBLKPTR, 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> SPA_BLKPTRSHIFT)); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr; if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_num_slots = dn_slots; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); } #ifdef _KERNEL static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); /* Copy fields. */ ndn->dn_objset = odn->dn_objset; ndn->dn_object = odn->dn_object; ndn->dn_dbuf = odn->dn_dbuf; ndn->dn_handle = odn->dn_handle; ndn->dn_phys = odn->dn_phys; ndn->dn_type = odn->dn_type; ndn->dn_bonuslen = odn->dn_bonuslen; ndn->dn_bonustype = odn->dn_bonustype; ndn->dn_nblkptr = odn->dn_nblkptr; ndn->dn_checksum = odn->dn_checksum; ndn->dn_compress = odn->dn_compress; ndn->dn_nlevels = odn->dn_nlevels; ndn->dn_indblkshift = odn->dn_indblkshift; ndn->dn_datablkshift = odn->dn_datablkshift; ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; ndn->dn_num_slots = odn->dn_num_slots; memcpy(ndn->dn_next_type, odn->dn_next_type, sizeof (odn->dn_next_type)); memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr, sizeof (odn->dn_next_nblkptr)); memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels, sizeof (odn->dn_next_nlevels)); memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift, sizeof (odn->dn_next_indblkshift)); memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype, sizeof (odn->dn_next_bonustype)); memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk, sizeof (odn->dn_rm_spillblk)); memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen, sizeof (odn->dn_next_bonuslen)); memcpy(ndn->dn_next_blksz, odn->dn_next_blksz, sizeof (odn->dn_next_blksz)); memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid, sizeof (odn->dn_next_maxblkid)); for (int i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } memcpy(ndn->dn_free_ranges, odn->dn_free_ranges, sizeof (odn->dn_free_ranges)); ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; ndn->dn_dirty_txg = odn->dn_dirty_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_bonus = odn->dn_bonus; ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; ndn->dn_oldused = 
odn->dn_oldused; ndn->dn_oldflags = odn->dn_oldflags; ndn->dn_olduid = odn->dn_olduid; ndn->dn_oldgid = odn->dn_oldgid; ndn->dn_oldprojid = odn->dn_oldprojid; ndn->dn_newuid = odn->dn_newuid; ndn->dn_newgid = odn->dn_newgid; ndn->dn_newprojid = odn->dn_newprojid; ndn->dn_id_flags = odn->dn_id_flags; dmu_zfetch_init(&ndn->dn_zfetch, ndn); /* * Update back pointers. Updating the handle fixes the back pointer of * every descendant dbuf as well as the bonus dbuf. */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; /* * Invalidate the original dnode by clearing all of its back pointers. */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_bonus = NULL; dmu_zfetch_fini(&odn->dn_zfetch); /* * Set the low bit of the objset pointer to ensure that dnode_move() * recognizes the dnode as invalid in any subsequent callback. */ POINTER_INVALIDATE(&odn->dn_objset); /* * Satisfy the destructor. */ for (int i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); odn->dn_free_ranges[i] = NULL; odn->dn_next_nlevels[i] = 0; odn->dn_next_indblkshift[i] = 0; odn->dn_next_bonustype[i] = 0; odn->dn_rm_spillblk[i] = 0; odn->dn_next_bonuslen[i] = 0; odn->dn_next_blksz[i] = 0; } odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; odn->dn_dirty_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; odn->dn_oldflags = 0; odn->dn_olduid = 0; odn->dn_oldgid = 0; odn->dn_oldprojid = ZFS_DEFAULT_PROJID; odn->dn_newuid = 0; odn->dn_newgid = 0; odn->dn_newprojid = ZFS_DEFAULT_PROJID; odn->dn_id_flags = 0; /* * Mark the dnode. */ ndn->dn_moved = 1; odn->dn_moved = (uint8_t)-1; } static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { dnode_t *odn = buf, *ndn = newbuf; objset_t *os; int64_t refcount; uint32_t dbufs; /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when * freeing the dnode to invalidate it, and the memory patterns written * by kmem (baddcafe and deadbeef) set at least one of the two low bits. * A newly created dnode sets the objset pointer last of all to indicate * that the dnode is known and in a valid state to be moved by this * function. */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { DNODE_STAT_BUMP(dnode_move_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Ensure that the objset does not go away during the move. */ rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); DNODE_STAT_BUMP(dnode_move_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the dnode is still valid, then so is the objset. We know that no * valid objset can be freed while we hold os_lock, so we can safely * ensure that the objset remains in use. */ mutex_enter(&os->os_lock); /* * Recheck the objset pointer in case the dnode was removed just before * acquiring the lock. */ if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); DNODE_STAT_BUMP(dnode_move_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold os->os_lock, the dnode * cannot be freed and fields within the dnode can be safely accessed. * The objset listing this dnode cannot go away as long as this dnode is * on its list. 
*/ rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ /* * Lock the dnode handle to prevent the dnode from obtaining any new * holds. This also prevents the descendant dbufs and the bonus dbuf * from accessing the dnode, so that we can discount their holds. The * handle is safe to access because we know that while the dnode cannot * go away, neither can its handle. Once we hold dnh_zrlock, we can * safely move any dnode referenced only by dbufs. */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_handle); return (KMEM_CBRC_LATER); } /* * Ensure a consistent view of the dnode's holds and the dnode's dbufs. * We need to guarantee that there is a hold for every dbuf in order to * determine whether the dnode is actively referenced. Falsely matching * a dbuf to an active hold would lead to an unsafe move. It's possible * that a thread already having an active dnode hold is about to add a * dbuf, and we can't compare hold and dbuf counts while the add is in * progress. */ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_rwlock); return (KMEM_CBRC_LATER); } /* * A dbuf may be removed (evicted) without an active dnode hold. In that * case, the dbuf count is decremented under the handle lock before the * dbuf's hold is released. This order ensures that if we count the hold * after the dbuf is removed but before its hold is released, we will * treat the unmatched hold as active and exit safely. If we count the * hold before the dbuf is removed, the hold is discounted, and the * removal is blocked until the move completes. */ refcount = zfs_refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); dbufs = DN_DBUFS_COUNT(odn); /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, uint32_t, dbufs); if (refcount > dbufs) { rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_active); return (KMEM_CBRC_LATER); } rw_exit(&odn->dn_struct_rwlock); /* * At this point we know that anyone with a hold on the dnode is not * actively referencing it. The dnode is known and in a valid state to * move. We're holding the locks needed to execute the critical section. */ dnode_move_impl(odn, ndn); list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. 
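 *
 * (Aside: the go/no-go test above is simple arithmetic.  Every dbuf holds
 * its dnode, so a dnode referenced only through its dbufs has
 * refcount == dbufs; any surplus hold means another thread is actively
 * using it.  Sketch of the rule:
 *
 *	if (refcount > dbufs)
 *		return (KMEM_CBRC_LATER);	-- some other hold is active
 *	-- otherwise refcount == dbufs and the move can proceed
 *
 * e.g. refcount 3 with 3 dbufs is movable, refcount 4 with 3 dbufs is
 * deferred.)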
*/ ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds)); ASSERT(dbufs == DN_DBUFS_COUNT(ndn)); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); return (KMEM_CBRC_YES); } #endif /* _KERNEL */ static void dnode_slots_hold(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; zrl_add(&dnh->dnh_zrlock); } } static void dnode_slots_rele(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; if (zrl_is_locked(&dnh->dnh_zrlock)) zrl_exit(&dnh->dnh_zrlock); else zrl_remove(&dnh->dnh_zrlock); } } static int dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; if (!zrl_tryenter(&dnh->dnh_zrlock)) { for (int j = idx; j < i; j++) { dnh = &children->dnc_children[j]; zrl_exit(&dnh->dnh_zrlock); } return (0); } } return (1); } static void dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; dnh->dnh_dnode = ptr; } } static boolean_t dnode_check_slots_free(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); /* * If all dnode slots are either already free or * evictable return B_TRUE. */ for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; dnode_t *dn = dnh->dnh_dnode; if (dn == DN_SLOT_FREE) { continue; } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && zfs_refcount_is_zero(&dn->dn_holds) && !DNODE_IS_DIRTY(dn)); mutex_exit(&dn->dn_mtx); if (!can_free) return (B_FALSE); else continue; } else { return (B_FALSE); } } return (B_TRUE); } -static void +static uint_t dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { + uint_t reclaimed = 0; + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; + reclaimed++; } } + + return (reclaimed); } void dnode_free_interior_slots(dnode_t *dn) { dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; int idx = (dn->dn_object & (epb - 1)) + 1; int slots = dn->dn_num_slots - 1; if (slots == 0) return; ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); kpreempt(KPREEMPT_SYNC); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); dnode_slots_rele(children, idx, slots); } void dnode_special_close(dnode_handle_t *dnh) { dnode_t *dn = dnh->dnh_dnode; /* * Ensure dnode_rele_and_unlock() has released dn_mtx, after final * zfs_refcount_remove() */ mutex_enter(&dn->dn_mtx); if (zfs_refcount_count(&dn->dn_holds) > 0) cv_wait(&dn->dn_nodnholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0); ASSERT(dn->dn_dbuf == NULL || 
dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; zrl_init(&dnh->dnh_zrlock); VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock)); dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); zrl_exit(&dnh->dnh_zrlock); } static void dnode_buf_evict_async(void *dbu) { dnode_children_t *dnc = dbu; DNODE_STAT_BUMP(dnode_buf_evict); for (int i = 0; i < dnc->dnc_count; i++) { dnode_handle_t *dnh = &dnc->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to * another valid address, so there is no need here to guard * against changes to or from NULL. */ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = DN_SLOT_UNINIT; continue; } zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(zfs_refcount_is_zero(&dn->dn_holds)); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); dnode_destroy(dn); /* implicit zrl_remove() for first slot */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = DN_SLOT_UNINIT; } kmem_free(dnc, sizeof (dnode_children_t) + dnc->dnc_count * sizeof (dnode_handle_t)); } /* * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used * to ensure the hole at the specified object offset is large enough to * hold the dnode being created. The slots parameter is also used to ensure * a dnode does not span multiple dnode blocks. In both of these cases, if * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases * are only possible when using DNODE_MUST_BE_FREE. * * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. * dnode_hold_impl() will check if the requested dnode is already consumed * as an extra dnode slot by an large dnode, in which case it returns * ENOENT. * * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just * return whether the hold would succeed or not. tag and dnp should set to * NULL in this case. * * errors: * EINVAL - Invalid object number or flags. * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) * EIO - I/O error when reading the meta dnode dbuf. * * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, const void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_children_t *dnc; dnode_phys_t *dn_block; dnode_handle_t *dnh; ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL)); /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool * which may require us to read from the root filesystem while * holding some (not all) of the locks as writer. 
*/ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT || object == DMU_PROJECTUSED_OBJECT) { if (object == DMU_USERUSED_OBJECT) dn = DMU_USERUSED_DNODE(os); else if (object == DMU_GROUPUSED_OBJECT) dn = DMU_GROUPUSED_DNODE(os); else dn = DMU_PROJECTUSED_DNODE(os); if (dn == NULL) return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); /* Don't actually hold if dry run, just return 0 */ if (!(flag & DNODE_DRY_RUN)) { (void) zfs_refcount_add(&dn->dn_holds, tag); *dnp = dn; } return (0); } if (object == 0 || object >= DN_MAX_OBJECT) return (SET_ERROR(EINVAL)); mdn = DMU_META_DNODE(os); ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { rw_enter(&mdn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); if (db == NULL) { DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); } /* * We do not need to decrypt to read the dnode so it doesn't matter * if we get the encrypted or decrypted version. */ err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); if (err) { DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); return (err); } ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; idx = object & (epb - 1); dn_block = (dnode_phys_t *)db->db.db_data; ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); dnc = dmu_buf_get_user(&db->db); dnh = NULL; if (dnc == NULL) { dnode_children_t *winner; int skip = 0; dnc = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); dnc->dnc_count = epb; dnh = &dnc->dnc_children[0]; /* Initialize dnode slot status from dnode_phys_t */ for (int i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); if (skip) { skip--; continue; } if (dn_block[i].dn_type != DMU_OT_NONE) { int interior = dn_block[i].dn_extra_slots; dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); dnode_set_slots(dnc, i + 1, interior, DN_SLOT_INTERIOR); skip = interior; } else { dnh[i].dnh_dnode = DN_SLOT_FREE; skip = 0; } } dmu_buf_init_user(&dnc->dnc_dbu, NULL, dnode_buf_evict_async, NULL); winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); if (winner != NULL) { for (int i = 0; i < epb; i++) zrl_destroy(&dnh[i].dnh_zrlock); kmem_free(dnc, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); dnc = winner; } } ASSERT(dnc->dnc_count == epb); if (flag & DNODE_MUST_BE_ALLOCATED) { slots = 1; dnode_slots_hold(dnc, idx, slots); dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { dn = dnh->dnh_dnode; } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { DNODE_STAT_BUMP(dnode_hold_alloc_interior); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(EEXIST)); } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { DNODE_STAT_BUMP(dnode_hold_alloc_misses); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOENT)); } else { dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); 
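/*
 * (Aside, not part of the change: the dmu_buf_add_user_size() and
 * dmu_buf_sub_user_size() calls added just below keep the metadnode dbuf's
 * user size in step with the dnode_t children instantiated in its slots;
 * that running total is what the new "usize" column of the dbufs kstat
 * reports.  A minimal userland model of the bookkeeping -- the names and
 * FAKE_DNODE_SIZE are illustrative stand-ins, not the kernel API.)
 */
#if 0
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define	FAKE_DNODE_SIZE	1024	/* arbitrary stand-in for sizeof (dnode_t) */

static uint64_t user_size;	/* running total for one dbuf */

static void
add_user_size(size_t n)
{
	user_size += n;
}

static void
sub_user_size(size_t n)
{
	assert(n <= user_size);
	user_size -= n;
}

int
main(void)
{
	/* two dnodes instantiated under one metadnode dbuf ... */
	add_user_size(FAKE_DNODE_SIZE);
	add_user_size(FAKE_DNODE_SIZE);
	/* ... and one of them reclaimed again */
	sub_user_size(1 * FAKE_DNODE_SIZE);
	assert(user_size == FAKE_DNODE_SIZE);
	return (0);
}
#endif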
kpreempt(KPREEMPT_SYNC); } /* * Someone else won the race and called dnode_create() * after we checked DN_SLOT_IS_PTR() above but before * we acquired the lock. */ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); dn = dnh->dnh_dnode; } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, + sizeof (dnode_t)); } } mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { DNODE_STAT_BUMP(dnode_hold_alloc_type_none); mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOENT)); } /* Don't actually hold if dry run, just return 0 */ if (flag & DNODE_DRY_RUN) { mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (0); } DNODE_STAT_BUMP(dnode_hold_alloc_hits); } else if (flag & DNODE_MUST_BE_FREE) { if (idx + slots - 1 >= DNODES_PER_BLOCK) { DNODE_STAT_BUMP(dnode_hold_free_overflow); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } dnode_slots_hold(dnc, idx, slots); if (!dnode_check_slots_free(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_misses); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); kpreempt(KPREEMPT_SYNC); } if (!dnode_check_slots_free(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_misses); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } /* * Allocated but otherwise free dnodes which would * be in the interior of a multi-slot dnodes need * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (slots > 1) { + uint_t reclaimed = + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (reclaimed > 0) + dmu_buf_sub_user_size(&db->db, + reclaimed * sizeof (dnode_t)); + } dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { dn = dnh->dnh_dnode; } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, sizeof (dnode_t)); } mutex_enter(&dn->dn_mtx); if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { DNODE_STAT_BUMP(dnode_hold_free_refcount); mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(EEXIST)); } /* Don't actually hold if dry run, just return 0 */ if (flag & DNODE_DRY_RUN) { mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (0); } dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); DNODE_STAT_BUMP(dnode_hold_free_hits); } else { dbuf_rele(db, FTAG); return (SET_ERROR(EINVAL)); } ASSERT0(dn->dn_free_txg); if (zfs_refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ dnode_slots_rele(dnc, idx, slots); DNODE_VERIFY(dn); ASSERT3P(dnp, !=, NULL); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); *dnp = dn; return (0); } /* * Return held dnode if the object is allocated, NULL if not. */ int dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, dnp)); } /* * Can only add a reference if there is already at least one * reference on the dnode. Returns FALSE if unable to add a * new reference. 
*/ boolean_t dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); if (zfs_refcount_is_zero(&dn->dn_holds)) { mutex_exit(&dn->dn_mtx); return (FALSE); } VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); mutex_exit(&dn->dn_mtx); return (TRUE); } void dnode_rele(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag, B_FALSE); } void dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ dmu_buf_impl_t *db = dn->dn_dbuf; dnode_handle_t *dnh = dn->dn_handle; refs = zfs_refcount_remove(&dn->dn_holds, tag); if (refs == 0) cv_broadcast(&dn->dn_nodnholds); mutex_exit(&dn->dn_mtx); /* dnode could get destroyed at this point, so don't use it anymore */ /* * It's unsafe to release the last hold on a dnode by dnode_rele() or * indirectly by dbuf_rele() while relying on the dnode handle to * prevent the dnode from moving, since releasing the last hold could * result in the dnode's parent dbuf evicting its dnode handles. For * that reason anyone calling dnode_rele() or dbuf_rele() without some * other direct or indirect hold on the dnode must first drop the dnode * handle. */ #ifdef ZFS_DEBUG ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); #endif /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { /* * Another thread could add a hold to the dnode handle in * dnode_hold_impl() while holding the parent dbuf. Since the * hold on the parent dbuf prevents the handle from being * destroyed, the hold on the handle is OK. We can't yet assert * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, dnh, evicting); } } /* * Test whether we can create a dnode at the specified location. */ int dnode_try_claim(objset_t *os, uint64_t object, int slots) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN, slots, NULL, NULL)); } /* * Checks if the dnode contains any uncommitted dirty records. */ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); for (int i = 0; i < TXG_SIZE; i++) { if (multilist_link_active(&dn->dn_dirty_link[i])) { mutex_exit(&dn->dn_mtx); return (B_TRUE); } } mutex_exit(&dn->dn_mtx); return (B_FALSE); } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx); return; } DNODE_VERIFY(dn); #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif /* * Determine old uid/gid when necessary */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK]; multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); /* * If we are already marked dirty, we're done. 
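 *
 * (Aside: all of the per-txg dnode state touched here -- dn_dirty_link[],
 * dn_next_bonuslen[], dn_next_blksz[], and so on -- lives in small rings
 * indexed by "txg & TXG_MASK", so only the last TXG_SIZE (currently 4)
 * transaction groups can have in-flight per-dnode state.  For example,
 * txg 125 and txg 129 both map to slot 125 & 3 == 1, which is safe because
 * txg 125 must have fully synced out before txg 129 can begin dirtying
 * anything.)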
*/ if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { multilist_sublist_unlock(mls); return; } ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) || !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]); ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", (u_longlong_t)dn->dn_object, (u_longlong_t)txg); multilist_sublist_insert_head(mls, dn); multilist_sublist_unlock(mls); /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) { mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { mutex_exit(&dn->dn_mtx); return; } dn->dn_free_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); } /* * Try to change the block size for the indicated dnode. This can only * succeed if there are no blocks allocated or dirty beyond first block */ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db; int err; ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); if (ibs == dn->dn_indblkshift) ibs = 0; if (size == dn->dn_datablksz && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } } mutex_exit(&dn->dn_dbufs_mtx); if (ibs && dn->dn_nlevels != 1) goto fail; dnode_setdirty(dn, tx); if (size != dn->dn_datablksz) { /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) { dbuf_new_size(db, size, tx); } else if (err != ENOENT) { goto fail; } dnode_setdblksz(dn, size); dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; if (db) dbuf_rele(db, FTAG); } if (ibs) { dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; } rw_exit(&dn->dn_struct_rwlock); return (0); fail: rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(ENOTSUP)); } static void dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; list_t *list; dbuf_dirty_record_t *new, *dr, *dr_next; ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT3U(new_nlevels, >, dn->dn_nlevels); dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = 
dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); if (dr->dr_dbuf == NULL || (dr->dr_dbuf->db_level == old_nlevels - 1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; } } mutex_exit(&new->dt.di.dr_mtx); mutex_exit(&dn->dn_mtx); } int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx) { int ret = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_nlevels == nlevels) { ret = 0; goto out; } else if (nlevels < dn->dn_nlevels) { ret = SET_ERROR(EINVAL); goto out; } dnode_set_nlevels_impl(dn, nlevels, tx); out: rw_exit(&dn->dn_struct_rwlock); return (ret); } /* read-holding callers must not rely on the lock being continuously held */ void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, boolean_t force) { int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * if we have a read-lock, check to see if we need to do any work * before upgrading to a write-lock. */ if (have_read) { if (blkid <= dn->dn_maxblkid) return; if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } } /* * Raw sends (indicated by the force flag) require that we take the * given blkid even if the value is lower than the current value. */ if (!force && blkid <= dn->dn_maxblkid) goto out; /* * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] * to indicate that this field is set. This allows us to set the * maxblkid to 0 on an existing object in dnode_sync(). */ dn->dn_maxblkid = blkid; dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid | DMU_NEXT_MAXBLKID_SET; /* * Compute the number of levels necessary to support the new maxblkid. * Raw sends will ensure nlevels is set correctly for us. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS); if (!force) { if (new_nlevels > dn->dn_nlevels) dnode_set_nlevels_impl(dn, new_nlevels, tx); } else { ASSERT3U(dn->dn_nlevels, >=, new_nlevels); } out: if (have_read) rw_downgrade(&dn->dn_struct_rwlock); } static void dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); if (db != NULL) { dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } } /* * Dirty all the in-core level-1 dbufs in the range specified by start_blkid * and end_blkid. */ static void dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db; avl_index_t where; db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP); mutex_enter(&dn->dn_dbufs_mtx); db_search->db_level = 1; db_search->db_blkid = start_blkid + 1; db_search->db_state = DB_SEARCH; for (;;) { db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); if (db == NULL || db->db_level != 1 || db->db_blkid >= end_blkid) { break; } /* * Setup the next blkid we want to search for. 
*/ db_search->db_blkid = db->db_blkid + 1; ASSERT3U(db->db_blkid, >=, start_blkid); /* * If the dbuf transitions to DB_EVICTING while we're trying * to dirty it, then we will be unable to discover it in * the dbuf hash table. This will result in a call to * dbuf_create() which needs to acquire the dn_dbufs_mtx * lock. To avoid a deadlock, we drop the lock before * dirtying the level-1 dbuf. */ mutex_exit(&dn->dn_dbufs_mtx); dnode_dirty_l1(dn, db->db_blkid, tx); mutex_enter(&dn->dn_dbufs_mtx); } #ifdef ZFS_DEBUG /* * Walk all the in-core level-1 dbufs and verify they have been dirtied. */ db_search->db_level = 1; db_search->db_blkid = start_blkid + 1; db_search->db_state = DB_SEARCH; db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_level != 1 || db->db_blkid >= end_blkid) break; if (db->db_state != DB_EVICTING) ASSERT(db->db_dirtycnt > 0); } #endif kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) { /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; if (ds != NULL) { rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); } if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { if (dmu_tx_is_syncing(tx)) dn->dn_dirtyctx = DN_DIRTY_SYNC; else dn->dn_dirtyctx = DN_DIRTY_OPEN; dn->dn_dirtyctx_firstset = tag; } if (ds != NULL) { rrw_exit(&ds->ds_bp_rwlock, tag); } } } static void dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; int res; rw_enter(&dn->dn_struct_rwlock, RW_READER); res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (res == 0) { db_lock_type_t dblt; boolean_t dirty; dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); /* don't dirty if not on disk and not dirty */ dirty = !list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); dmu_buf_unlock_parent(db, dblt, FTAG); if (dirty) { caddr_t data; dmu_buf_will_dirty(&db->db, tx); data = db->db.db_data; memset(data + blkoff, 0, len); } dbuf_rele(db, FTAG); } } void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } /* * First, block align the region to free: */ if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) return; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. */ blkid = 0; nblks = 1; if (dn->dn_nlevels > 1) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_dirty_l1(dn, 0, tx); rw_exit(&dn->dn_struct_rwlock); } goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ return; } else { /* Freeing part of the block. 
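 * As an illustrative example (values arbitrary, not taken from the code):
 * with a non-power-of-2 block of blksz = 3072 and a request to free
 * off = 1024, len = 1024, we get head = 3072 - 1024 = 2048 and
 * blkoff = 1024; len is smaller than head, so head is clamped to 1024
 * below and dnode_partial_zero() simply clears bytes [1024, 2048) of the
 * object's only block.  No whole block is freed, so nothing is added to
 * the free range tree.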
*/ head = blksz - off; ASSERT3U(head, >, 0); } blkoff = off; } /* zero out any partial block data at the start of the range */ if (head) { ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; dnode_partial_zero(dn, off, blkoff, head, tx); off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) return; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) return; ASSERT(ISP2(blksz)); if (trunc) tail = 0; else tail = P2PHASE(len, blksz); ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { if (len < tail) tail = len; dnode_partial_zero(dn, off + len, 0, tail, tx); len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) return; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); blkid = off >> blkshift; nblks = len >> blkshift; if (trunc) nblks += 1; /* * Dirty all the indirect blocks in this range. Note that only * the first and last indirect blocks can actually be written * (if they were partially freed) -- they must be dirtied, even if * they do not exist on disk yet. The interior blocks will * be freed by free_children(), so they will not actually be written. * Even though these interior blocks will not be written, we * dirty them for two reasons: * * - It ensures that the indirect blocks remain in memory until * syncing context. (They have already been prefetched by * dmu_tx_hold_free(), so we don't have to worry about reading * them serially here.) * * - The dirty space accounting will put pressure on the txg sync * mechanism to begin syncing, and to delay transactions if there * is a large amount of freeing. Even though these indirect * blocks will not be written, we could need to write the same * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); uint64_t first, last; first = blkid >> epbs; dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; if (last != first) dnode_dirty_l1(dn, last, tx); dnode_dirty_l1range(dn, first, last, tx); int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { /* * Set i to the blockid of the next non-hole * level-1 indirect block at or after i. Note * that dnode_next_offset() operates in terms of * level-0-equivalent bytes. */ uint64_t ibyte = i << shift; int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); i = ibyte >> shift; if (i >= last) break; /* * Normally we should not see an error, either * from dnode_next_offset() or dbuf_hold_level() * (except for ESRCH from dnode_next_offset). * If there is an i/o error, then when we read * this block in syncing context, it will use * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according * to the "failmode" property. dnode_next_offset() * doesn't have a flag to indicate MUSTSUCCEED. */ if (err != 0) break; dnode_dirty_l1(dn, i, tx); } rw_exit(&dn->dn_struct_rwlock); } done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. 
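 * As a worked example (numbers arbitrary): with a 131072-byte (128K)
 * block size, freeing off = 1000, len = 300000 proceeds as follows:
 * head = 131072 - 1000 = 130072, so bytes [1000, 131072) of block 0 are
 * zeroed; tail = (300000 - 130072) % 131072 = 38856, so bytes
 * [262144, 301000) of block 2 are zeroed; the remaining len = 131072
 * covers exactly block 1 (blkid = 1, nblks = 1), which is what gets
 * recorded here and freed later in syncing context.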
*/
	mutex_enter(&dn->dn_mtx);
	{
		int txgoff = tx->tx_txg & TXG_MASK;
		if (dn->dn_free_ranges[txgoff] == NULL) {
			dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
			    RANGE_SEG64, NULL, 0, 0);
		}
		range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
		range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
	}
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	    (u_longlong_t)blkid, (u_longlong_t)nblks,
	    (u_longlong_t)tx->tx_txg);
	mutex_exit(&dn->dn_mtx);

	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
	dnode_setdirty(dn, tx);
}

static boolean_t
dnode_spill_freed(dnode_t *dn)
{
	int i;

	mutex_enter(&dn->dn_mtx);
	for (i = 0; i < TXG_SIZE; i++) {
		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
			break;
	}
	mutex_exit(&dn->dn_mtx);
	return (i < TXG_SIZE);
}

/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
	int i;

	if (blkid == DMU_BONUS_BLKID)
		return (FALSE);

	if (dn->dn_free_txg)
		return (TRUE);

	if (blkid == DMU_SPILL_BLKID)
		return (dnode_spill_freed(dn));

	mutex_enter(&dn->dn_mtx);
	for (i = 0; i < TXG_SIZE; i++) {
		if (dn->dn_free_ranges[i] != NULL &&
		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
			break;
	}
	mutex_exit(&dn->dn_mtx);
	return (i < TXG_SIZE);
}

/* call from syncing context when we actually write/free space for this dnode */
void
dnode_diduse_space(dnode_t *dn, int64_t delta)
{
	uint64_t space;
	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
	    dn, dn->dn_phys,
	    (u_longlong_t)dn->dn_phys->dn_used,
	    (longlong_t)delta);

	mutex_enter(&dn->dn_mtx);
	space = DN_USED_BYTES(dn->dn_phys);
	if (delta > 0) {
		ASSERT3U(space + delta, >=, space); /* no overflow */
	} else {
		ASSERT3U(space, >=, -delta); /* no underflow */
	}
	space += delta;
	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
	} else {
		dn->dn_phys->dn_used = space;
		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
	}
	mutex_exit(&dn->dn_mtx);
}

/*
 * Scans a block at the indicated "level" looking for a hole or data,
 * depending on 'flags'.
 *
 * If level > 0, then we are scanning an indirect block looking at its
 * pointers.  If level == 0, then we are looking at a block of dnodes.
 *
 * If we don't find what we are looking for in the block, we return ESRCH.
 * Otherwise, return with *offset pointing to the beginning (if searching
 * forwards) or end (if searching backwards) of the range covered by the
 * block pointer we matched on (or dnode).
 *
 * The basic search algorithm used below by dnode_next_offset() is to
 * use this function to search up the block tree (widen the search) until
 * we find something (i.e., we don't return ESRCH) and then search back
 * down the tree (narrow the search) until we reach our original search
 * level.
 */
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
    int lvl, uint64_t blkfill, uint64_t txg)
{
	dmu_buf_impl_t *db = NULL;
	void *data = NULL;
	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t epb = 1ULL << epbs;
	uint64_t minfill, maxfill;
	boolean_t hole;
	int i, inc, error, span;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	hole = ((flags & DNODE_FIND_HOLE) != 0);
	inc = (flags & DNODE_FIND_BACKWARDS) ?
-1 : 1; ASSERT(txg == 0 || !hole); if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); /* * This can only happen when we are searching up * the block tree for data. We don't really need to * adjust the offset, as we will just end up looking * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; rw_enter(&db->db_rwlock, RW_READER); } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; ASSERT(dn->dn_type == DMU_OT_DNODE); ASSERT(!(flags & DNODE_FIND_BACKWARDS)); for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; } if (i == blkfill) error = SET_ERROR(ESRCH); *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); if (hole) maxfill--; else minfill++; if (span >= 8 * sizeof (*offset)) { /* This only happens on the highest indirection level */ ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); *offset = 0; } else { *offset = *offset >> span; } for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; } if (span >= 8 * sizeof (*offset)) { *offset = start; } else { *offset = *offset << span; } if (inc < 0) { /* traversing backwards; position offset at the end */ if (span < 8 * sizeof (*offset)) *offset = MIN(*offset + (1ULL << span) - 1, start); } else if (*offset < start) { *offset = start; } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); } if (db != NULL) { rw_exit(&db->db_rwlock); dbuf_rele(db, FTAG); } return (error); } /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). 
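 * A minimal caller sketch (illustrative only; assumes a held dnode and
 * omits locking and error handling):
 *
 *	uint64_t off = 0;
 *	int err = dnode_next_offset(dn, DNODE_FIND_HOLE, &off, 1, 1, 0);
 *
 * On success (err == 0) off has been advanced to the start of the next
 * hole, which may be the "virtual hole" at end-of-file; ESRCH means
 * nothing matching was found.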
*/ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; if (!(flags & DNODE_FIND_HAVELOCK)) rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { error = SET_ERROR(ESRCH); goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = SET_ERROR(ESRCH); } goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); } /* * There's always a "virtual hole" at the end of the object, even * if all BP's which physically exist are non-holes. */ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { error = 0; } if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? initial_offset < *offset : initial_offset > *offset)) error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); } #if defined(_KERNEL) EXPORT_SYMBOL(dnode_hold); EXPORT_SYMBOL(dnode_rele); EXPORT_SYMBOL(dnode_set_nlevels); EXPORT_SYMBOL(dnode_set_blksz); EXPORT_SYMBOL(dnode_free_range); EXPORT_SYMBOL(dnode_evict_dbufs); EXPORT_SYMBOL(dnode_evict_bonus); #endif ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW, "Default dnode block shift"); ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW, "Default dnode indirect block shift"); diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 25eea0752941..66bc0ae60b10 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -1,747 +1,748 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include /* * Multi-Modifier Protection (MMP) attempts to prevent a user from importing * or opening a pool on more than one host at a time. In particular, it * prevents "zpool import -f" on a host from succeeding while the pool is * already imported on another host. There are many other ways in which a * device could be used by two hosts for different purposes at the same time * resulting in pool damage. This implementation does not attempt to detect * those cases. * * MMP operates by ensuring there are frequent visible changes on disk (a * "heartbeat") at all times. 
And by altering the import process to check * for these changes and failing the import when they are detected. This * functionality is enabled by setting the 'multihost' pool property to on. * * Uberblocks written by the txg_sync thread always go into the first * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP. * They are used to hold uberblocks which are exactly the same as the last * synced uberblock except that the ub_timestamp and mmp_config are frequently * updated. Like all other uberblocks, the slot is written with an embedded * checksum, and slots with invalid checksums are ignored. This provides the * "heartbeat", with no risk of overwriting good uberblocks that must be * preserved, e.g. previous txgs and associated block pointers. * * Three optional fields are added to uberblock structure; ub_mmp_magic, * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells * the importing host the settings of zfs_multihost_interval and * zfs_multihost_fail_intervals on the host which last had (or currently has) * the pool imported. These determine how long a host must wait to detect * activity in the pool, before concluding the pool is not in use. The * mmp_delay field is a decaying average of the amount of time between * completion of successive MMP writes, in nanoseconds. It indicates whether * MMP is enabled. * * During import an activity test may now be performed to determine if * the pool is in use. The activity test is typically required if the * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is * POOL_STATE_ACTIVE, and the pool is not a root pool. * * The activity test finds the "best" uberblock (highest txg, timestamp, and, if * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits * some time, and finds the "best" uberblock again. If any of the mentioned * fields have different values in the newly read uberblock, the pool is in use * by another host and the import fails. In order to assure the accuracy of the * activity test, the default values result in an activity test duration of 20x * the mmp write interval. * * The duration of the "zpool import" activity test depends on the information * available in the "best" uberblock: * * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0: * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2 * * In this case, a weak guarantee is provided. Since the host which last had * the pool imported will suspend the pool if no mmp writes land within * fail_intervals * multihost_interval ms, the absence of writes during that * time means either the pool is not imported, or it is imported but the pool * is suspended and no further writes will occur. * * Note that resuming the suspended pool on the remote host would invalidate * this guarantee, and so it is not allowed. * * The factor of 2 provides a conservative safety factor and derives from * MMP_IMPORT_SAFETY_FACTOR; * * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0: * (ub_mmp_config.multihost_interval + ub_mmp_delay) * * zfs_multihost_import_intervals * * In this case no guarantee can provided. 
However, as long as some devices * are healthy and connected, it is likely that at least one write will land * within (multihost_interval + mmp_delay) because multihost_interval is * enough time for a write to be attempted to each leaf vdev, and mmp_delay * is enough for one to land, based on past delays. Multiplying by * zfs_multihost_import_intervals provides a conservative safety factor. * * 3) If uberblock was written by zfs-0.7: * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals * * The same logic as case #2 applies, but we do not know remote tunables. * * We use the local value for zfs_multihost_interval because the original MMP * did not record this value in the uberblock. * * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect * that. We will have waited enough time for zfs_multihost_import_intervals * writes to be issued and all but one to land. * * single device pool example delays * * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval, * no I/O delay * 100 device pool example delays * * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval, * no I/O delay * * 4) Otherwise, this uberblock was written by a pre-MMP zfs: * zfs_multihost_import_intervals * zfs_multihost_interval * * In this case local tunables are used. By default this product = 10s, long * enough for a pool with any activity at all to write at least one * uberblock. No guarantee can be provided. * * Additionally, the duration is then extended by a random 25% to attempt to to * detect simultaneous imports. For example, if both partner hosts are rebooted * at the same time and automatically attempt to import the pool. */ /* * Used to control the frequency of mmp writes which are performed when the * 'multihost' pool property is on. This is one factor used to determine the * length of the activity check during import. * * On average an mmp write will be issued for each leaf vdev every * zfs_multihost_interval milliseconds. In practice, the observed period can * vary with the I/O load and this observed value is the ub_mmp_delay which is * stored in the uberblock. The minimum allowed value is 100 ms. */ uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; /* * Used to control the duration of the activity test on import. Smaller values * of zfs_multihost_import_intervals will reduce the import time but increase * the risk of failing to detect an active pool. The total activity check time * is never allowed to drop below one second. A value of 0 is ignored and * treated as if it was set to 1. */ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; /* * Controls the behavior of the pool when mmp write failures or delays are * detected. * * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are * ignored. The failures will still be reported to the ZED which depending on * its configuration may take action such as suspending the pool or taking a * device offline. * * When zfs_multihost_fail_intervals > 0, the pool will be suspended if * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass * without a successful mmp write. 
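 * For example, with the stock defaults assumed here
 * (zfs_multihost_interval = 1000 ms, zfs_multihost_fail_intervals = 10),
 * an mmp write must succeed at least once every 10 * 1000 ms = 10 seconds
 * or the pool is suspended, while an importer running the case 1)
 * activity test above waits 10 * 1000 ms * 2 = 20 seconds.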
This guarantees the activity test will see * mmp writes if the pool is imported. A value of 1 is ignored and treated as * if it was set to 2, because a single leaf vdev pool will issue a write once * per multihost_interval and thus any variation in latency would cause the * pool to be suspended. */ uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; static const void *const mmp_tag = "mmp_write_uberblock"; static __attribute__((noreturn)) void mmp_thread(void *arg); void mmp_init(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); mmp->mmp_kstat_id = 1; } void mmp_fini(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_destroy(&mmp->mmp_thread_lock); cv_destroy(&mmp->mmp_thread_cv); mutex_destroy(&mmp->mmp_io_lock); } static void mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr) { CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG); mutex_enter(&mmp->mmp_thread_lock); } static void mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) { ASSERT(*mpp != NULL); *mpp = NULL; cv_broadcast(&mmp->mmp_thread_cv); CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ } void mmp_thread_start(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; if (spa_writeable(spa)) { mutex_enter(&mmp->mmp_thread_lock); if (!mmp->mmp_thread) { mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, spa, 0, &p0, TS_RUN, defclsyspri); zfs_dbgmsg("MMP thread started pool '%s' " "gethrtime %llu", spa_name(spa), gethrtime()); } mutex_exit(&mmp->mmp_thread_lock); } } void mmp_thread_stop(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_thread_lock); mmp->mmp_thread_exiting = 1; cv_broadcast(&mmp->mmp_thread_cv); while (mmp->mmp_thread) { cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); } mutex_exit(&mmp->mmp_thread_lock); zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", spa_name(spa), gethrtime()); ASSERT(mmp->mmp_thread == NULL); mmp->mmp_thread_exiting = 0; } typedef enum mmp_vdev_state_flag { MMP_FAIL_NOT_WRITABLE = (1 << 0), MMP_FAIL_WRITE_PENDING = (1 << 1), } mmp_vdev_state_flag_t; /* * Find a leaf vdev to write an MMP block to. It must not have an outstanding * mmp write (if so a new write will also likely block). If there is no usable * leaf, a nonzero error value is returned. The error value returned is a bit * field. * * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an * outstanding MMP write. * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. 
*/ static int mmp_next_leaf(spa_t *spa) { vdev_t *leaf; vdev_t *starting_leaf; int fail_mask = 0; ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); ASSERT(!list_is_empty(&spa->spa_leaf_list)); if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; } leaf = spa->spa_mmp.mmp_last_leaf; if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); starting_leaf = leaf; do { leaf = list_next(&spa->spa_leaf_list, leaf); if (leaf == NULL) { leaf = list_head(&spa->spa_leaf_list); ASSERT3P(leaf, !=, NULL); } /* * We skip unwritable, offline, detached, and dRAID spare * devices as they are either not legal targets or the write * may fail or not be seen by other hosts. Skipped dRAID * spares can never be written so the fail mask is not set. */ if (!vdev_writeable(leaf) || leaf->vdev_offline || leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { spa->spa_mmp.mmp_last_leaf = leaf; return (0); } } while (leaf != starting_leaf); ASSERT(fail_mask); return (fail_mask); } /* * MMP writes are issued on a fixed schedule, but may complete at variable, * much longer, intervals. The mmp_delay captures long periods between * successful writes for any reason, including disk latency, scheduling delays, * etc. * * The mmp_delay is usually calculated as a decaying average, but if the latest * delay is higher we do not average it, so that we do not hide sudden spikes * which the importing host must wait for. * * If writes are occurring frequently, such as due to a high rate of txg syncs, * the mmp_delay could become very small. Since those short delays depend on * activity we cannot count on, we never allow mmp_delay to get lower than rate * expected if only mmp_thread writes occur. * * If an mmp write was skipped or fails, and we have already waited longer than * mmp_delay, we need to update it so the next write reflects the longer delay. * * Do not set mmp_delay if the multihost property is not on, so as not to * trigger an activity check on import. */ static void mmp_delay_update(spa_t *spa, boolean_t write_completed) { mmp_thread_t *mts = &spa->spa_mmp; hrtime_t delay = gethrtime() - mts->mmp_last_write; ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); if (spa_multihost(spa) == B_FALSE) { mts->mmp_delay = 0; return; } if (delay > mts->mmp_delay) mts->mmp_delay = delay; if (write_completed == B_FALSE) return; mts->mmp_last_write = gethrtime(); /* * strictly less than, in case delay was changed above. 
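 * As an illustration (hypothetical numbers): with mmp_delay currently at
 * 1,000,000,000 ns and the latest write completing after 200,000,000 ns,
 * the average below becomes (200,000,000 + 127 * 1,000,000,000) / 128,
 * roughly 993,750,000 ns, i.e. each fast write pulls the average down by
 * only ~1/128th of the difference.  With a 1000 ms interval and 4 leaf
 * vdevs, min_delay works out to 250,000,000 ns, the floor described above.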
*/ if (delay < mts->mmp_delay) { hrtime_t min_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) / MAX(1, vdev_count_leaves(spa)); mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), min_delay); } } static void mmp_write_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; mmp_thread_t *mts = zio->io_private; mutex_enter(&mts->mmp_io_lock); uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; mmp_delay_update(spa, (zio->io_error == 0)); vd->vdev_mmp_pending = 0; vd->vdev_mmp_kstat_id = 0; mutex_exit(&mts->mmp_io_lock); spa_config_exit(spa, SCL_STATE, mmp_tag); spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error, mmp_write_duration); abd_free(zio->io_abd); } /* * When the uberblock on-disk is updated by a spa_sync, * creating a new "best" uberblock, update the one stored * in the mmp thread state, used for mmp writes. */ void mmp_update_uberblock(spa_t *spa, uberblock_t *ub) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_io_lock); mmp->mmp_ub = *ub; mmp->mmp_seq = 1; mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp_delay_update(spa, B_TRUE); mutex_exit(&mmp->mmp_io_lock); } /* * Choose a random vdev, label, and MMP block, and write over it * with a copy of the last-synced uberblock, whose timestamp * has been updated to reflect that the pool is in use. */ static void mmp_write_uberblock(spa_t *spa) { int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; mmp_thread_t *mmp = &spa->spa_mmp; uberblock_t *ub; vdev_t *vd = NULL; int label, error; uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " "gethrtime %llu", spa_name(spa), lock_acquire_time, gethrtime()); mutex_enter(&mmp->mmp_io_lock); error = mmp_next_leaf(spa); /* * spa_mmp_history has two types of entries: * Issued MMP write: records time issued, error status, etc. * Skipped MMP write: an MMP write could not be issued because no * suitable leaf vdev was available. See comment above struct * spa_mmp_history for details. */ if (error) { mmp_delay_update(spa, B_FALSE); if (mmp->mmp_skip_error == error) { spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); } else { mmp->mmp_skip_error = error; spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, gethrestime_sec(), mmp->mmp_delay, NULL, 0, mmp->mmp_kstat_id++, error); zfs_dbgmsg("MMP error choosing leaf pool '%s' " "gethrtime %llu fail_mask %#x", spa_name(spa), gethrtime(), error); } mutex_exit(&mmp->mmp_io_lock); spa_config_exit(spa, SCL_STATE, mmp_tag); return; } vd = spa->spa_mmp.mmp_last_leaf; if (mmp->mmp_skip_error != 0) { mmp->mmp_skip_error = 0; zfs_dbgmsg("MMP write after skipping due to unavailable " "leaves, pool '%s' gethrtime %llu leaf %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)vd->vdev_guid); } if (mmp->mmp_zio_root == NULL) mmp->mmp_zio_root = zio_root(spa, NULL, NULL, flags | ZIO_FLAG_GODFATHER); if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) { /* * Want to reset mmp_seq when timestamp advances because after * an mmp_seq wrap new values will not be chosen by * uberblock_compare() as the "best". 
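 * For illustration: the uberblock comparison orders slots by txg, then
 * timestamp, then MMP sequence, so (txg 100, ts T, seq 3) beats
 * (txg 100, ts T, seq 2), but once the timestamp advances to T + 1 even
 * seq 1 wins; restarting the sequence at 1 whenever the timestamp
 * advances is therefore safe.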
*/ mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp->mmp_seq = 1; } ub = &mmp->mmp_ub; ub->ub_mmp_magic = MMP_MAGIC; ub->ub_mmp_delay = mmp->mmp_delay; ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) | MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) | MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals)); vd->vdev_mmp_pending = gethrtime(); vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id; zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); mmp->mmp_seq++; mmp->mmp_kstat_id++; mutex_exit(&mmp->mmp_io_lock); offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL)); label = random_in_range(VDEV_LABELS); vdev_label_write(zio, vd, label, ub_abd, offset, VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, flags | ZIO_FLAG_DONT_PROPAGATE); (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0); zio_nowait(zio); } static __attribute__((noreturn)) void mmp_thread(void *arg) { spa_t *spa = (spa_t *)arg; mmp_thread_t *mmp = &spa->spa_mmp; boolean_t suspended = spa_suspended(spa); boolean_t multihost = spa_multihost(spa); uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( zfs_multihost_interval)); uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; boolean_t last_spa_suspended; boolean_t last_spa_multihost; uint64_t last_mmp_interval; uint32_t last_mmp_fail_intervals; hrtime_t last_mmp_fail_ns; callb_cpr_t cpr; int skip_wait = 0; mmp_thread_enter(mmp, &cpr); /* * There have been no MMP writes yet. Setting mmp_last_write here gives * us one mmp_fail_ns period, which is consistent with the activity * check duration, to try to land an MMP write before MMP suspends the * pool (if so configured). */ mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); mutex_exit(&mmp->mmp_io_lock); while (!mmp->mmp_thread_exiting) { hrtime_t next_time = gethrtime() + MSEC2NSEC(MMP_DEFAULT_INTERVAL); int leaves = MAX(vdev_count_leaves(spa), 1); /* Detect changes in tunables or state */ last_spa_suspended = suspended; last_spa_multihost = multihost; suspended = spa_suspended(spa); multihost = spa_multihost(spa); last_mmp_interval = mmp_interval; last_mmp_fail_intervals = mmp_fail_intervals; last_mmp_fail_ns = mmp_fail_ns; mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( zfs_multihost_interval)); mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); /* Smooth so pool is not suspended when reducing tunables */ if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) { mmp_fail_ns = (mmp_fail_ns * 31 + mmp_fail_intervals * mmp_interval) / 32; } else { mmp_fail_ns = mmp_fail_intervals * mmp_interval; } if (mmp_interval != last_mmp_interval || mmp_fail_intervals != last_mmp_fail_intervals) { /* * We want other hosts to see new tunables as quickly as * possible. Write out at higher frequency than usual. 
*/ skip_wait += leaves; } if (multihost) next_time = gethrtime() + mmp_interval / leaves; if (mmp_fail_ns != last_mmp_fail_ns) { zfs_dbgmsg("MMP interval change pool '%s' " "gethrtime %llu last_mmp_interval %llu " "mmp_interval %llu last_mmp_fail_intervals %u " "mmp_fail_intervals %u mmp_fail_ns %llu " "skip_wait %d leaves %d next_time %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)last_mmp_interval, (u_longlong_t)mmp_interval, last_mmp_fail_intervals, mmp_fail_intervals, (u_longlong_t)mmp_fail_ns, skip_wait, leaves, (u_longlong_t)next_time); } /* * MMP off => on, or suspended => !suspended: * No writes occurred recently. Update mmp_last_write to give * us some time to try. */ if ((!last_spa_multihost && multihost) || (last_spa_suspended && !suspended)) { zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " "last_spa_multihost %u multihost %u " "last_spa_suspended %u suspended %u", spa_name(spa), (u_longlong_t)gethrtime(), last_spa_multihost, multihost, last_spa_suspended, suspended); mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = mmp_interval; mutex_exit(&mmp->mmp_io_lock); } /* * MMP on => off: * mmp_delay == 0 tells importing node to skip activity check. */ if (last_spa_multihost && !multihost) { mutex_enter(&mmp->mmp_io_lock); mmp->mmp_delay = 0; mutex_exit(&mmp->mmp_io_lock); } /* * Suspend the pool if no MMP write has succeeded in over * mmp_interval * mmp_fail_intervals nanoseconds. */ if (multihost && !suspended && mmp_fail_intervals && (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " "mmp_fail_intervals %llu mmp_fail_ns %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, (u_longlong_t)mmp_fail_ns); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", spa_name(spa), NSEC2MSEC(gethrtime() - mmp->mmp_last_write), gethrtime()); zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); } if (multihost && !suspended) mmp_write_uberblock(spa); if (skip_wait > 0) { next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) / leaves; skip_wait--; } CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), CALLOUT_FLAG_ABSOLUTE); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); } /* Outstanding writes are allowed to complete. */ zio_wait(mmp->mmp_zio_root); mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); thread_exit(); } /* * Signal the MMP thread to wake it, when it is sleeping on * its cv. Used when some module parameter has changed and * we want the thread to know about it. * Only signal if the pool is active and mmp thread is * running, otherwise there is no thread to wake. 
*/ static void mmp_signal_thread(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_thread_lock); if (mmp->mmp_thread) cv_broadcast(&mmp->mmp_thread_cv); mutex_exit(&mmp->mmp_thread_lock); } void mmp_signal_all_threads(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa))) { if (spa->spa_state == POOL_STATE_ACTIVE) mmp_signal_thread(spa); } mutex_exit(&spa_namespace_lock); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW, "Max allowed period without a successful mmp write"); ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW, "Number of zfs_multihost_interval periods to wait for activity"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index e8f562a1a6a2..4c6501cc9a09 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -1,2084 +1,2086 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * Virtual Device Labels * --------------------- * * The vdev label serves several distinct purposes: * * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * * 2. Verify that all the devices given in a configuration are present * within the pool. * * 3. Determine the uberblock for the pool. * * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * * 5. If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * * It is important to note that while the kernel is responsible for writing the * label, it only consumes the information in the first three cases. The * latter information is only consumed in userland when determining the * configuration to import a pool. * * * Label Organization * ------------------ * * Before describing the contents of the label, it's important to understand how * the labels are written and updated with respect to the uberblock. * * When the pool configuration is altered, either because it was newly created * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. 
Assuming we have * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ * | | | | | | * | t10 | | t10 | | t10 | * | | | | | | * +------+ +------+ +------+ * * In this stable state, the labels and the uberblock were all updated within * the same transaction group (10). Each label is mirrored and checksummed, so * that we can detect when we fail partway through writing the label. * * In order to identify which labels are valid, the labels are written in the * following manner: * * 1. For each vdev, update 'L1' to the new label * 2. Update the uberblock * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the * UB, we will notice that L1's transaction group is greater than the uberblock, * so L2 must be valid. If we fail after writing the uberblock but before * writing L2, we will notice that L2's transaction group is less than L1, and * therefore L1 is valid. * * Another added complexity is that not every label is updated when the config * is synced. If we add a single device, we do not want to have to re-write * every label for every device in the pool. This means that both L1 and L2 may * be older than the pool uberblock, because the necessary information is stored * on another vdev. * * * On-disk Format * -------------- * * The vdev label consists of two distinct parts, and is wrapped within the * vdev_label_t structure. The label includes 8k of padding to permit legacy * VTOC disk labels, but is otherwise ignored. * * The first half of the label is a packed nvlist which contains pool wide * properties, per-vdev properties, and configuration information. It is * described in more detail below. * * The latter half of the label consists of a redundant array of uberblocks. * These uberblocks are updated whenever a transaction group is committed, * or when the configuration is updated. When a pool is loaded, we scan each * vdev for the 'best' uberblock. * * * Configuration Information * ------------------------- * * The nvlist describing the pool and vdev contains the following elements: * * version ZFS on-disk version * name Pool name * state Pool state * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * * top_guid Unique ID for top-level vdev in which this is contained * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. */ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } /* * Returns back the vdev label associated with the passed in offset. 
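 * For illustration (the 256 KiB figure is sizeof (vdev_label_t) on the
 * current on-disk format): labels L0 and L1 live at the front of the
 * device at offsets 0 and 256K, while L2 and L3 live at the back at
 * psize - 512K and psize - 256K, so vdev_label_offset(psize, 3, off)
 * evaluates to off + 3 * 256K + (psize - 4 * 256K) = psize - 256K + off.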
*/ int vdev_label_number(uint64_t psize, uint64_t offset) { int l; if (offset >= psize - VDEV_LABEL_END_SIZE) { offset -= psize - VDEV_LABEL_END_SIZE; offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); } l = offset / sizeof (vdev_label_t); return (l < VDEV_LABELS ? l : -1); } static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* * Generate the nvlist representing this vdev's stats */ void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) { nvlist_t *nvx; vdev_stat_t *vs; vdev_stat_ex_t *vsx; vs = kmem_alloc(sizeof (*vs), KM_SLEEP); vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); vdev_get_stats_ex(vd, vs, vsx); fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); /* * Add extended stats into a special extended stats nvlist. This keeps * all the extended stats nicely grouped together. The extended stats * nvlist is then added to the main nvlist. 
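 * A consumer-side sketch (illustrative; userland tools such as
 * "zpool iostat -q" read these fields in roughly this way):
 *
 *	nvlist_t *nvx = fnvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX);
 *	uint64_t sync_r = fnvlist_lookup_uint64(nvx,
 *	    ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE);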
*/ nvx = fnvlist_alloc(); /* ZIOs in flight to disk */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_REBUILD]); /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_REBUILD]); /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD])); /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], 
ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD])); /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); fnvlist_free(nvx); kmem_free(vs, sizeof (*vs)); kmem_free(vsx, sizeof (*vsx)); } static void root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { spa_t *spa = vd->vdev_spa; if (vd != spa->spa_root_vdev) return; /* provide either current or previous scan information */ pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } pool_removal_stat_t prs; if (spa_removal_get_stats(spa, &prs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, sizeof (prs) / sizeof (uint64_t)); } pool_checkpoint_stat_t pcs; if (spa_checkpoint_get_stats(spa, &pcs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } pool_raidz_expand_stat_t pres; if (spa_raidz_expand_get_stats(spa, &pres) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, sizeof (pres) / sizeof (uint64_t)); } } static void top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { if (vd == vd->vdev_top) { vdev_rebuild_stat_t vrs; if 
(vdev_rebuild_get_stats(vd, &vrs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, sizeof (vrs) / sizeof (uint64_t)); } } } /* * Generate the nvlist representing this vdev's config. */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); if (vd->vdev_devid != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); if (vd->vdev_physpath != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); if (vd->vdev_enc_sysfs_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_ops->vdev_op_config_generate != NULL) vd->vdev_ops->vdev_op_config_generate(vd, nv); if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); if (flags & VDEV_CONFIG_L2CACHE) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array); fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, vd->vdev_ms_shift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, vd->vdev_noalloc); } /* * Slog devices are removed synchronously so don't * persist the vdev_removing flag to the label. 
*/ if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: bias = VDEV_ALLOC_BIAS_LOG; break; case VDEV_BIAS_SPECIAL: bias = VDEV_ALLOC_BIAS_SPECIAL; break; case VDEV_BIAS_DEDUP: bias = VDEV_ALLOC_BIAS_DEDUP; break; default: ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } } if (vd->vdev_dtl_sm != NULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, space_map_object(vd->vdev_dtl_sm)); } if (vic->vic_mapping_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, vic->vic_mapping_object); } if (vic->vic_births_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, vic->vic_births_object); } if (vic->vic_prev_indirect_vdev != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, vic->vic_prev_indirect_vdev); } if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); if (vd->vdev_expansion_time) fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME, vd->vdev_expansion_time); if (flags & VDEV_CONFIG_MOS) { if (vd->vdev_leaf_zap != 0) { ASSERT(vd->vdev_ops->vdev_op_leaf); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, vd->vdev_leaf_zap); } if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, vd->vdev_top_zap); } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, vd->vdev_root_zap); } if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER); } } if (getstats) { vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context * (spa_get_stats()), so we need the rwlock to prevent * the mapping from being changed by condensing. */ rw_enter(&vd->vdev_indirect_rwlock, RW_READER); if (vd->vdev_indirect_mapping != NULL) { ASSERT(vd->vdev_indirect_births != NULL); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, vdev_indirect_mapping_size(vim)); } rw_exit(&vd->vdev_indirect_rwlock); if (vd->vdev_mg != NULL && vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { /* * Compute approximately how much memory would be used * for the indirect mapping if this device were to * be removed. * * Note: If the frag metric is invalid, then not * enough metaslabs have been converted to have * histograms. */ uint64_t seg_count = 0; uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one * entry per free segment. However, small free * segments (smaller than vdev_removal_max_span) * will be combined with adjacent allocated segments * as a single mapping. */ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { if (i + 1 < highbit64(vdev_removal_max_span) - 1) { to_alloc += vd->vdev_mg->mg_histogram[i] << (i + 1); } else { seg_count += vd->vdev_mg->mg_histogram[i]; } } /* * The maximum length of a mapping is * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. 
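 * As a rough worked example (sizes assumed, not measured): with 1 TiB
 * allocated on the device and a 16 MiB zfs_remove_max_segment, this adds
 * 2^40 / 2^24 = 65536 entries to seg_count, on top of the per-free-segment
 * entries counted above, before multiplying by the size of one mapping
 * entry below.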
*/ seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * sizeof (vdev_indirect_mapping_entry_phys_t)); } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); for (c = 0; c < vd->vdev_children; c++) { child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, vd->vdev_children); for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { const char *aux = NULL; if (vd->vdev_offline && !vd->vdev_tmpoffline) fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); if (vd->vdev_rebuild_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); if (vd->vdev_removed) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); if (vd->vdev_unspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); if (vd->vdev_ishole) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); /* Set the reason why we're FAULTED/DEGRADED. */ switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: aux = "err_exceeded"; break; case VDEV_AUX_EXTERNAL: aux = "external"; break; } if (aux != NULL && !vd->vdev_tmpoffline) { fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); } else { /* * We're healthy - clear any previous AUX_STATE values. */ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); } if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, vd->vdev_orig_guid); } } return (nv); } /* * Generate a view of the top-level vdevs. If we currently have holes * in the namespace, then generate an array which contains a list of holey * vdevs. Additionally, add the number of top-level children that currently * exist. */ void vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) { array[idx++] = c; } } if (idx) { VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, array, idx) == 0); } VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, rvd->vdev_children) == 0); kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } /* * Returns the configuration from the label of the given vdev. For vdevs * which don't have a txg value stored on their label (i.e. spares/cache) * or have not been completely initialized (txg = 0) just return * the configuration from the first valid label we find. Otherwise, * find the most up-to-date label that does not exceed the specified * 'txg' value. 
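 *
 * A minimal sketch of a caller (hypothetical; error handling elided):
 *
 *	nvlist_t *cfg = vdev_label_read_config(vd, UINT64_MAX);
 *	if (cfg != NULL) {
 *		uint64_t label_txg = 0;
 *		(void) nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG,
 *		    &label_txg);
 *		nvlist_free(cfg);
 *	}
 *
 * Passing UINT64_MAX (i.e. -1ULL) places no upper bound on the label
 * txg, which is how vdev_inuse() below uses this function.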
*/ nvlist_t * vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp[VDEV_LABELS]; abd_t *vp_abd[VDEV_LABELS]; zio_t *zio[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(vd->vdev_validate_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); /* * The label for a dRAID distributed spare is not stored on disk. * Instead it is generated when needed which allows us to bypass * the pipeline when reading the config from the label. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return (vdev_draid_read_config_spare(vd)); for (int l = 0; l < VDEV_LABELS; l++) { vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp[l] = abd_to_buf(vp_abd[l]); } retry: for (int l = 0; l < VDEV_LABELS; l++) { zio[l] = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio[l], vd, l, vp_abd[l], offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); } for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; if (zio_wait(zio[l]) == 0 && nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been * completely initialized so just return the * configuration from the first valid label we * encounter. */ error = nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { config = label; for (l++; l < VDEV_LABELS; l++) zio_wait(zio[l]); break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; nvlist_free(config); config = fnvlist_dup(label); } } if (label != NULL) { nvlist_free(label); label = NULL; } } if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } /* * We found a valid label but it didn't pass txg restrictions. */ if (config == NULL && label_txg != 0) { vdev_dbgmsg(vd, "label discarded as txg is too large " "(%llu > %llu)", (u_longlong_t)label_txg, (u_longlong_t)txg); } for (int l = 0; l < VDEV_LABELS; l++) { abd_free(vp_abd[l]); } return (config); } /* * Determine if a device is in use. The 'spare_guid' parameter will be filled * in with the device guid if this spare is active elsewhere on the system. */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; uint64_t vdtxg = 0; nvlist_t *label; if (spare_guid) *spare_guid = 0ULL; if (l2cache_guid) *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. */ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &vdtxg); if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &device_guid) != 0) { nvlist_free(label); return (B_FALSE); } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &txg) != 0)) { nvlist_free(label); return (B_FALSE); } nvlist_free(label); /* * Check to see if this device indeed belongs to the pool it claims to * be a part of. 
The only way this is allowed is if the device is a hot * spare (which we check for later on). */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && !spa_guid_exists(pool_guid, device_guid) && !spa_spare_exists(device_guid, NULL, NULL) && !spa_l2cache_exists(device_guid, NULL)) return (B_FALSE); /* * If the transaction group is zero, then this an initialized (but * unused) label. This is only an error if the create transaction * on-disk is the same as the one we're using now, in which case the * user has attempted to add the same vdev multiple times in the same * transaction. */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && txg == 0 && vdtxg == crtxg) return (B_TRUE); /* * Check to see if this is a spare device. We do an explicit check for * spa_has_spare() here because it may be on our pending list of spares * to add. */ if (spa_spare_exists(device_guid, &spare_pool, NULL) || spa_has_spare(spa, device_guid)) { if (spare_guid) *spare_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: return (B_TRUE); case VDEV_LABEL_REPLACE: return (!spa_has_spare(spa, device_guid) || spare_pool != 0ULL); case VDEV_LABEL_SPARE: return (spa_has_spare(spa, device_guid)); default: break; } } /* * Check to see if this is an l2cache device. */ if (spa_l2cache_exists(device_guid, NULL) || spa_has_l2cache(spa, device_guid)) { if (l2cache_guid) *l2cache_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: return (B_TRUE); case VDEV_LABEL_REPLACE: return (!spa_has_l2cache(spa, device_guid)); case VDEV_LABEL_L2CACHE: return (spa_has_l2cache(spa, device_guid)); default: break; } } /* * We can't rely on a pool's state if it's been imported * read-only. Instead we look to see if the pools is marked * read-only in the namespace and set the state to active. */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (spa = spa_by_guid(pool_guid, device_guid)) != NULL && spa_mode(spa) == SPA_MODE_READ) state = POOL_STATE_ACTIVE; /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ return (state == POOL_STATE_ACTIVE); } /* * Initialize a vdev label. We check to make sure each leaf device is not in * use, and writable. We put down an initial label which we will later * overwrite with a complete label. Note that it's important to do this * sequentially, not in parallel, so that we catch cases of multiple use of the * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with * itself. */ int vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) { spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); /* Track the creation time for this vdev */ vd->vdev_crtxg = crtxg; if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) return (0); /* * Dead vdevs cannot be initialized. */ if (vdev_is_dead(vd)) return (SET_ERROR(EIO)); /* * Determine if the vdev is in use. 
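 *
 * Roughly, vdev_inuse() reports B_TRUE when the label identifies an
 * active pool member, or a hot spare / l2cache device whose current
 * use conflicts with the requested 'reason'; for shared spares and
 * l2cache devices it also hands back the shared guid so the guid
 * adoption code below can take it over.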
*/ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (SET_ERROR(EBUSY)); /* * If this is a request to add or replace a spare or l2cache device * that is in use elsewhere on the system, then we must update the * guid (which was initialized to a random value) to reflect the * actual GUID (which is shared between multiple pools). */ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && spare_guid != 0ULL) { uint64_t guid_delta = spare_guid - vd->vdev_guid; vd->vdev_guid += guid_delta; for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE || reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && l2cache_guid != 0ULL) { uint64_t guid_delta = l2cache_guid - vd->vdev_guid; vd->vdev_guid += guid_delta; for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding an l2cache, then it's * already labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_L2CACHE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE); } /* * Initialize its label. */ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); abd_zero(vp_abd, sizeof (vdev_phys_t)); vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. * We mark it as being from txg 0 to indicate that it's not * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ if (reason == VDEV_LABEL_SPARE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { /* * For inactive hot spares, we generate a special label that * identifies as a mutually shared hot spare. We write the * label if we are adding a hot spare, or if we are removing an * active hot spare (in which case we want to revert the * labels). */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else if (reason == VDEV_LABEL_L2CACHE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { /* * For level 2 ARC devices, add a special label. */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); /* * This is merely to facilitate reporting the ashift of the * cache device through zdb. The actual retrieval of the * ashift (in vdev_alloc()) uses the nvlist * spa->spa_l2cache->sav_config (populated in * spa_ld_open_aux_vdevs()). */ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift) == 0); } else { uint64_t txg = 0ULL; if (reason == VDEV_LABEL_SPLIT) txg = spa->spa_uberblock.ub_txg; label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. 
This allows us to detect multiple * vdev uses as described above, and automatically expires if we * fail. */ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0); } buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL)); } /* * Initialize uberblock template. */ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_RING); abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_RING - sizeof (uberblock_t)); ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. */ bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(bootenv, VDEV_PAD_SIZE); /* * Write everything in parallel. */ retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); /* * Skip the 1st padding area. * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ vdev_label_write(zio, vd, l, bootenv, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } nvlist_free(label); abd_free(bootenv); abd_free(ub_abd); abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it * exists as a spare elsewhere in the system. Do the same for * level 2 ARC devices. */ if (error == 0 && !vd->vdev_isspare && (reason == VDEV_LABEL_SPARE || spa_spare_exists(vd->vdev_guid, NULL, NULL))) spa_spare_add(vd); if (error == 0 && !vd->vdev_isl2cache && (reason == VDEV_LABEL_L2CACHE || spa_l2cache_exists(vd->vdev_guid, NULL))) spa_l2cache_add(vd); return (error); } /* * Done callback for vdev_label_read_bootenv_impl. If this is the first * callback to finish, store our abd in the callback pointer. Otherwise, we * just free our abd and return. */ static void vdev_label_read_bootenv_done(zio_t *zio) { zio_t *rio = zio->io_private; abd_t **cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); if (zio->io_error == 0) { mutex_enter(&rio->io_lock); if (*cbp == NULL) { /* Will free this buffer in vdev_label_read_bootenv. */ *cbp = zio->io_abd; } else { abd_free(zio->io_abd); } mutex_exit(&rio->io_lock); } else { abd_free(zio->io_abd); } } static void vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) { for (int c = 0; c < vd->vdev_children; c++) vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); /* * We just use the first label that has a correct checksum; the * bootloader should have rewritten them all to be the same on boot, * and any changes we made since boot have been the same across all * labels. 
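 *
 * A minimal sketch of the public entry point defined below
 * (hypothetical caller; the required SCL_ALL config lock and error
 * handling are elided). A VB_RAW payload shows up as the GRUB_ENVMAP
 * string, while a VB_NVLIST payload is merged in as nvlist pairs:
 *
 *	nvlist_t *benv = fnvlist_alloc();
 *	if (vdev_label_read_bootenv(spa->spa_root_vdev, benv) == 0) {
 *		uint64_t ver = 0;
 *		(void) nvlist_lookup_uint64(benv, BOOTENV_VERSION, &ver);
 *	}
 *	nvlist_free(benv);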
*/ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, vdev_label_read_bootenv_done, zio, flags); } } } int vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) { nvlist_t *config; spa_t *spa = rvd->vdev_spa; abd_t *abd = NULL; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(bootenv); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); zio_t *zio = zio_root(spa, NULL, &abd, flags); vdev_label_read_bootenv_impl(zio, rvd, flags); int err = zio_wait(zio); if (abd != NULL) { char *buf; vdev_boot_envblock_t *vbe = abd_to_buf(abd); vbe->vbe_version = ntohll(vbe->vbe_version); switch (vbe->vbe_version) { case VB_RAW: /* * if we have textual data in vbe_bootenv, create nvlist * with key "envmap". */ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW); vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; fnvlist_add_string(bootenv, GRUB_ENVMAP, vbe->vbe_bootenv); break; case VB_NVLIST: err = nvlist_unpack(vbe->vbe_bootenv, sizeof (vbe->vbe_bootenv), &config, 0); if (err == 0) { fnvlist_merge(bootenv, config); nvlist_free(config); break; } zfs_fallthrough; default: /* Check for FreeBSD zfs bootonce command string */ buf = abd_to_buf(abd); if (*buf == '\0') { fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_NVLIST); break; } fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf); } /* * abd was allocated in vdev_label_read_bootenv_impl() */ abd_free(abd); /* * If we managed to read any successfully, * return success. */ return (0); } return (err); } int vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) { zio_t *zio; spa_t *spa = vd->vdev_spa; vdev_boot_envblock_t *bootenv; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; int error; size_t nvsize; char *nvbuf; const char *tmp; error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); if (error != 0) return (SET_ERROR(error)); if (nvsize >= sizeof (bootenv->vbe_bootenv)) { return (SET_ERROR(E2BIG)); } ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); error = ENXIO; for (int c = 0; c < vd->vdev_children; c++) { int child_err; child_err = vdev_label_write_bootenv(vd->vdev_child[c], env); /* * As long as any of the disks managed to write all of their * labels successfully, return success. 
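 *
 * A minimal sketch of how an env nvlist for this function might be
 * assembled (hypothetical values; the required SCL_ALL config lock is
 * elided):
 *
 *	nvlist_t *env = fnvlist_alloc();
 *	fnvlist_add_uint64(env, BOOTENV_VERSION, VB_RAW);
 *	fnvlist_add_string(env, GRUB_ENVMAP, "zfs_bootonce=rpool/ROOT/be");
 *	int err = vdev_label_write_bootenv(spa->spa_root_vdev, env);
 *	nvlist_free(env);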
*/ if (child_err == 0) error = child_err; } if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || !vdev_writeable(vd)) { return (error); } ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(abd, VDEV_PAD_SIZE); bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); nvbuf = bootenv->vbe_bootenv; nvsize = sizeof (bootenv->vbe_bootenv); bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { case VB_RAW: if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) { (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize); } error = 0; break; case VB_NVLIST: error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, KM_SLEEP); break; default: error = EINVAL; break; } if (error == 0) { bootenv->vbe_version = htonll(bootenv->vbe_version); abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); } else { abd_free(abd); return (SET_ERROR(error)); } retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, abd, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } abd_free(abd); return (error); } /* * ========================================================================== * uberblock load/sync * ========================================================================== */ /* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); /* * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware * ZFS, e.g. OpenZFS >= 0.7. * * If one ub has MMP and the other does not, they were written by * different hosts, which matters for MMP. So we treat no MMP/no SEQ as * a 0 value. * * Since timestamp and txg are the same if we get this far, either is * acceptable for importing the pool. 
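 *
 * In other words the ordering is (txg, timestamp, MMP sequence), most
 * significant key first: two uberblocks for the same txg written a
 * second apart are ordered by timestamp, and if an MMP-aware host then
 * rewrites the same txg and timestamp with a higher mmp_seq, that copy
 * wins.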
*/ unsigned int seq1 = 0; unsigned int seq2 = 0; if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) seq1 = MMP_SEQ(ub1); if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata { uberblock_t ubl_latest; /* Most recent uberblock */ uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; static void vdev_uberblock_load_done(zio_t *zio) { vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { cbp->ubl_latest = *ub; } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* * Keep track of the vdev in which this uberblock * was found. We will use this information later * to obtain the config nvlist associated with * this uberblock. */ *cbp->ubl_ubbest = *ub; cbp->ubl_vd = vd; } mutex_exit(&rio->io_lock); } abd_free(zio->io_abd); } static void vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) { for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } } } } /* * Reads the 'best' uberblock from disk along with its associated * configuration. First, we read the uberblock array of each label of each * vdev, keeping track of the uberblock with the highest txg in each array. * Then, we read the configuration from the same vdev as the best uberblock. */ void vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) { zio_t *zio; spa_t *spa = rvd->vdev_spa; struct ubl_cbdata cb; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(ub); ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); vdev_uberblock_load_impl(zio, rvd, flags, &cb); (void) zio_wait(zio); /* * It's possible that the best uberblock was discovered on a label * that has a configuration which was written in a future txg. * Search all labels on this vdev to find the configuration that * matches the txg for our uberblock. */ if (cb.ubl_vd != NULL) { vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. 
" "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); if (ub->ub_raidz_reflow_info != cb.ubl_latest.ub_raidz_reflow_info) { vdev_dbgmsg(cb.ubl_vd, "spa=%s best uberblock (txg=%llu info=0x%llx) " "has different raidz_reflow_info than latest " "uberblock (txg=%llu info=0x%llx)", spa->spa_name, (u_longlong_t)ub->ub_txg, (u_longlong_t)ub->ub_raidz_reflow_info, (u_longlong_t)cb.ubl_latest.ub_txg, (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); memset(ub, 0, sizeof (uberblock_t)); spa_config_exit(spa, SCL_ALL, FTAG); return; } *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " "Trying again without txg restrictions."); *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); } if (*config == NULL) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); } } spa_config_exit(spa, SCL_ALL, FTAG); } /* * For use when a leaf vdev is expanded. * The location of labels 2 and 3 changed, and at the new location the * uberblock rings are either empty or contain garbage. The sync will write * new configs there because the vdev is dirty, but expansion also needs the * uberblock rings copied. Read them from label 0 which did not move. * * Since the point is to populate labels {2,3} with valid uberblocks, * we zero uberblocks we fail to read or which are not valid. */ static void vdev_copy_uberblocks(vdev_t *vd) { abd_t *ub_abd; zio_t *write_zio; int locks = (SCL_L2ARC | SCL_ZIO); int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* * No uberblocks are stored on distributed spares, they may be * safely skipped when expanding a leaf vdev. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { const int src_label = 0; zio_t *zio; zio = zio_root(vd->vdev_spa, NULL, NULL, flags); vdev_label_read(zio, vd, src_label, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); for (int l = 2; l < VDEV_LABELS; l++) vdev_label_write(write_zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags | ZIO_FLAG_DONT_PROPAGATE); } (void) zio_wait(write_zio); spa_config_exit(vd->vdev_spa, locks, FTAG); abd_free(ub_abd); } /* * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). */ static void vdev_uberblock_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) atomic_inc_64(good_writes); } /* * Write the uberblock to all labels of all leaves of the specified vdev. */ static void vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, uberblock_t *ub, vdev_t *vd, int flags) { for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_uberblock_sync(zio, good_writes, ub, vd->vdev_child[c], flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* * There's no need to write uberblocks to a distributed spare, they * are already stored on all the leaves of the parent dRAID. 
For * this same reason vdev_uberblock_load_impl() skips distributed * spares when reading uberblocks. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { vdev_copy_uberblocks(vd); vd->vdev_copy_uberblocks = B_FALSE; } /* * We chose a slot based on the txg. If this uberblock has a special * RAIDZ expansion state, then it is essentially an update of the * current uberblock (it has the same txg). However, the current * state is committed, so we want to write it to a different slot. If * we overwrote the same slot, and we lose power during the uberblock * write, and the disk does not do single-sector overwrites * atomically (even though it is required to - i.e. we should see * either the old or the new uberblock), then we could lose this * txg's uberblock. Rewinding to the previous txg's uberblock may not * be possible because RAIDZ expansion may have already overwritten * some of the data, so we need the progress indicator in the * uberblock. */ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); (void) zio_wait(zio); /* * Flush the uberblocks to disk. This ensures that the odd labels * are no longer needed (because the new uberblocks and the even * labels are safely on disk), so it is safe to overwrite them. */ zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) { if (vdev_writeable(svd[v])) { zio_flush(zio, svd[v]); } } (void) zio_wait(zio); return (good_writes >= 1 ? 0 : EIO); } /* * On success, increment the count of good writes for our top-level vdev. */ static void vdev_label_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) atomic_inc_64(good_writes); } /* * If there weren't enough good writes, indicate failure to the parent. */ static void vdev_label_sync_top_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (*good_writes == 0) zio->io_error = SET_ERROR(EIO); kmem_free(good_writes, sizeof (uint64_t)); } /* * We ignore errors for log and cache devices, simply free the private data. */ static void vdev_label_sync_ignore_done(zio_t *zio) { kmem_free(zio->io_private, sizeof (uint64_t)); } /* * Write all even or odd labels to all leaves of the specified vdev. 
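 *
 * The caller chooses the starting label: vdev_label_sync_list(spa, 0,
 * ...) writes L0 and L2 (the loop below steps by two), and a second
 * pass with l == 1 writes L1 and L3 once the new uberblocks are on
 * disk. See vdev_config_sync() for the ordering.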
*/ static void vdev_label_sync(zio_t *zio, uint64_t *good_writes, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; char *buf; size_t buflen; for (int c = 0; c < vd->vdev_children; c++) { vdev_label_sync(zio, good_writes, vd->vdev_child[c], l, txg, flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* * The top-level config never needs to be written to a distributed * spare. When read vdev_dspare_label_read_config() will generate * the config for the vdev_label_read_config(). */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; /* * Generate a label describing the top-level config to which we belong. */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); abd_zero(vp_abd, sizeof (vdev_phys_t)); vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); } } abd_free(vp_abd); nvlist_free(label); } static int vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) { list_t *dl = &spa->spa_config_dirty_list; vdev_t *vd; zio_t *zio; int error; /* * Write the new labels to disk. */ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes; ASSERT(!vd->vdev_ishole); good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); vdev_label_sync(vio, good_writes, vd, l, txg, flags); zio_nowait(vio); } error = zio_wait(zio); /* * Flush the new labels to disk. */ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) zio_flush(zio, vd); (void) zio_wait(zio); return (error); } /* * Sync the uberblock and any changes to the vdev configuration. * * The order of operations is carefully crafted to ensure that * if the system panics or loses power at any time, the state on disk * is still transactionally consistent. The in-line comments below * describe the failure semantics at each stage. * * Moreover, vdev_config_sync() is designed to be idempotent: if it fails * at any time, you can just call it again, and it will resume its work. */ int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(svdcount != 0); retry: /* * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the * sync process to block while we retry. But if we can't write a * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. */ if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } ASSERT(ub->ub_txg <= txg); /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, * and the vdev configuration hasn't changed, * then there's nothing to do. 
*/ if (ub->ub_txg < txg) { boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); if (!changed && list_is_empty(&spa->spa_config_dirty_list)) return (0); } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ zio_t *zio = zio_root(spa, NULL, NULL, flags); for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); (void) zio_wait(zio); /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), * which have not yet been touched, will still be valid. We flush * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); } goto retry; } /* * Sync the uberblocks to all vdevs in svd[]. * If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * * (1) If none of the new uberblocks made it to disk, then the * previous uberblock will be the newest, and the odd labels * (which had not yet been touched) will be valid with respect * to that uberblock. * * (2) If one or more new uberblocks made it to disk, then they * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } goto retry; } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that * all labels will be brought up to date. We flush the new labels * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); } goto retry; } return (0); } diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index ce2cb8b1446a..b2699caa7589 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1,4222 +1,4222 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * * - a single, per-dataset, ZIL header; which points to a chain of * - zero or more ZIL blocks; each of which contains * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. 
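 *
 * These values back the global "zil" kstat (typically exposed as
 * /proc/spl/kstat/zfs/zil on Linux); they are snapshots of the
 * zil_sums_global wmsum counters, taken by zil_kstat_values_update().
 */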
*/ static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], sizeof (zc->zc_word[ZIL_ZC_GUID_0])); (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } static int zil_kstats_global_update(kstat_t *ksp, int rw) { zil_kstat_values_t *zs = ksp->ks_data; ASSERT3P(&zil_stats, ==, zs); 
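	/*
	 * kstat update callback: writes are rejected, reads fold the
	 * zil_sums_global wmsum counters into the exported snapshot.
	 */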
if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } zil_kstat_values_update(zs, &zil_sums_global); return (0); } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; if (!decrypt) zio_flags |= ZIO_FLAG_RAW; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; /* * If we are not using the resulting data, we are just checking that * it hasn't been corrupted so we don't need to waste CPU time * decompressing and decrypting it. 
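 *
 * zil_claim_write() below relies on this: during claim it calls
 * zil_read_log_data(zilog, lr, NULL) purely to verify that the block
 * referenced by the write record is still readable.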
*/ if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } void zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); wmsum_init(&zs->zil_itx_copied_count, 0); wmsum_init(&zs->zil_itx_copied_bytes, 0); wmsum_init(&zs->zil_itx_needcopy_count, 0); wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); wmsum_fini(&zs->zil_itx_copied_count); wmsum_fini(&zs->zil_itx_copied_bytes); wmsum_fini(&zs->zil_itx_needcopy_count); wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_write); wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); wmsum_fini(&zs->zil_itx_metaslab_slog_write); wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) { zs->zil_commit_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_count); zs->zil_itx_indirect_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_bytes); zs->zil_itx_copied_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_count); zs->zil_itx_copied_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_bytes); zs->zil_itx_needcopy_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_count); zs->zil_itx_needcopy_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_bytes); zs->zil_itx_metaslab_normal_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); zs->zil_itx_metaslab_normal_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); zs->zil_itx_metaslab_normal_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); zs->zil_itx_metaslab_slog_write.value.ui64 = 
wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); zs->zil_itx_metaslab_slog_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *lrp, *end; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; error = parse_blk_func(zilog, &blk, arg, txg); if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf); if (error != 0) { if (abuf) arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS read log block error %d, " "dataset %s, seq 0x%llx\n", error, name, (u_longlong_t)blk_seq); } break; } for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; } error = parse_lr_func(zilog, lr, arg, txg); if (error != 0) { arc_buf_destroy(abuf, &abuf); goto done; } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); return (error); } static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. 
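 *
 * For example, when rewinding to first_txg 1000, a log block born in
 * txg 1005 stops the walk here, while one born in txg 990 falls
 * through, is remembered in the bp tree, and is freed below.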
*/ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } static int zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ if (lr->lr_blkptr.blk_birth >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); } return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } static int zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } /* * XXX: Do we need to byteswap lr? */ spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } return (0); } static int zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: return (zil_claim_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { (void) claim_txg; zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If we previously claimed it, we need to free it. 
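 *
 * This mirrors zil_claim_write() above: only blocks born at or after
 * the claim txg were claimed, and the bp tree keeps us from freeing
 * the same block twice.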
*/ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } return (0); } static int zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } return (0); } static int zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { if (claim_txg == 0) { return (0); } switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_free_write(zilog, lrc, tx, claim_txg)); case TX_CLONE_RANGE: return (zil_free_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (TREE_CMP(v1, v2)); } /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case * we choose them here and later make the block allocation match. */ static lwb_t * zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); sz = BP_GET_LSIZE(bp); } else { BP_ZERO(&lwb->lwb_blk); lwb->lwb_slim = (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL); } lwb->lwb_slog = slog; lwb->lwb_error = 0; if (lwb->lwb_slim) { lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); } else { lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; } lwb->lwb_sz = sz; lwb->lwb_state = state; lwb->lwb_buf = zio_buf_alloc(sz); lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; lwb->lwb_alloc_txg = txg; lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); if (state != LWB_STATE_NEW) zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). 
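 *
 * As a rough sketch of that pairing (simplified; the exact call sites
 * are elsewhere in this file and in the DSL/SPA sync code):
 *
 *	zil_itx_assign(zilog, itx, tx)
 *	    -> zilog_dirty(zilog, dmu_tx_get_txg(tx))
 *	       adds zilog to dp->dp_dirty_zilogs and takes a hold on
 *	       ds->ds_dbuf via dmu_buf_add_ref()
 *	...
 *	spa_sync() for that txg
 *	    -> zil_clean(zilog, synced_txg)
 *	       frees the per-txg itxs; the ds_dbuf hold is released once
 *	       the zilog is pulled off dp_dirty_zilogs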
*/ static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Its called in zil_commit context (zil_process_commit_list()/zil_create()). * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every * zil_commit. */ static void zil_commit_activate_saxattr_feature(zilog_t *zilog) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); uint64_t txg = 0; dmu_tx_t *tx = NULL; if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block pointer into zh_log. 
* (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { /* * If "zilsaxattr" feature is enabled on zpool, then activate * it now when we're creating the ZIL chain. We can't wait with * this until we write the first xattr log record because we * need to wait for the feature activation to sync out. */ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL) { mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } else { /* * This branch covers the case where we enable the feature on a * zpool that has existing ZIL headers. */ zil_commit_activate_saxattr_feature(zilog); } IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); return (B_TRUE); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. 
*/ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. */ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. 
*/ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. * In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_issuer_lock while the lwb is open and zl_lock otherwise. * zl_issuer_lock also protects leaving the open state. * zcw_lwb setting is protected by zl_issuer_lock and state != * flush_done, which transition is protected by zl_lock. 
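 *
 * For orientation, a simplified sketch of a waiter's lifetime (the
 * real code paths are zil_commit() and the lwb completion callbacks
 * below; this is illustration, not additional behavior):
 *
 *	zcw allocated by zil_commit()            zcw_done == B_FALSE
 *	itx->itx_private = zcw                   TX_COMMIT itx carries it
 *	zil_commit_waiter_link_lwb(zcw, lwb)     zcw_lwb = lwb
 *	... lwb write and flush complete ...
 *	zil_lwb_flush_vdevs_done():
 *		zcw->zcw_zio_error = zio->io_error;
 *		zcw->zcw_done = B_TRUE;
 *		cv_broadcast(&zcw->zcw_cv);      wakes the committing thread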
*/ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); IMPLY(lwb->lwb_state != LWB_STATE_OPENED, MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). */ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". 
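 *
 * For reference, the (simplified) lwb state progression that ends in
 * this callback is:
 *
 *	NEW -> OPENED -> CLOSED -> READY -> ISSUED ->
 *	    WRITE_DONE -> FLUSH_DONE
 *
 * zil_lwb_write_open() takes NEW to OPENED, zil_lwb_write_close()
 * takes OPENED to CLOSED, zil_lwb_write_issue() moves CLOSED through
 * READY to ISSUED, zil_lwb_write_done() moves ISSUED to WRITE_DONE,
 * and the callback below performs the final transition to FLUSH_DONE,
 * after which zil_sync() may free the lwb via zil_free_lwb().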
*/ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. */ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in * order for this error handling to work correctly. This * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). * * With that said, it's important to note that LWB flush * errors are not propagated up to the LWB root ZIO. * This is incorrect behavior, and results in VDEV flush * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } uint64_t txg = lwb->lwb_issued_txg; /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) cv_broadcast(&zilog->zl_lwb_io_cv); mutex_exit(&zilog->zl_lwb_io_lock); } /* * Wait for the completion of all issued write/flush of that txg provided. * It guarantees zil_lwb_flush_vdevs_done() is called and returned. */ static void zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) { ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); mutex_enter(&zilog->zl_lwb_io_lock); while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lwb_io_lock); #ifdef ZFS_DEBUG mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } mutex_exit(&zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lock); #endif } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. 
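 *
 * Condensed to its decision points (a sketch, not extra behavior),
 * the callback below does:
 *
 *	if (zio->io_error != 0)
 *		discard lwb_vdev_tree;           no flush after a failed write
 *	else if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL)
 *		zil_lwb_flush_defer(lwb, nlwb);  hand the tree to the next lwb
 *	else
 *		zio_flush() every vdev remaining in lwb_vdev_tree;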
The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; /* * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not * called for it yet, and when it will be, it won't be able to make * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ nlwb = list_next(&zilog->zl_lwb_list, lwb); if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * * Additionally, we don't perform any further error handling at * this point (e.g. setting "zcw_zio_error" appropriately), as * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, * we expect any error seen here, to have been propagated to * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL && !vd->vdev_nowritecache) { + if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, * any errors when flushing the vdev(s), will * (unfortunately) not be handled correctly, * since these "zio_flush" errors will not be * propagated up to "zil_lwb_flush_vdevs_done". */ zio_flush(lwb->lwb_root_zio, vd); } kmem_free(zv, sizeof (*zv)); } } /* * Build the zio dependency chain, which is used to preserve the ordering of * lwb completions that is required by the semantics of the ZIL. 
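 *
 * In schematic form (a simplified picture of the parent/child edges
 * the function below creates with zio_add_child()):
 *
 *	lwb[N].lwb_root_zio  --- parent of --->  lwb[N-1].lwb_root_zio
 *	lwb[N].lwb_write_zio --- parent of --->  lwb[N-1].lwb_write_zio
 *	                                         (write edge only while
 *	                                         lwb[N-1] is still ISSUED)
 *
 * so lwb[N] cannot complete, and its waiters cannot be signalled,
 * before lwb[N-1] has completed.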
Each new lwb * zio becomes a parent of the previous lwb zio, such that the new lwb's zio * cannot complete until the previous lwb's zio completes. * * This is required by the semantics of zil_commit(): the commit waiters * attached to the lwbs will be woken in the lwb zio's completion callback, * so this zio dependency graph ensures the waiters are woken in the correct * order (the same order the lwbs were created). */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); if (prev_lwb == NULL || prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous * lwb will rely on this lwb to flush the vdevs written to by that * previous lwb. Thus, we need to ensure this lwb doesn't issue the * flush until after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child relationship here. * * Without this relationship on the lwb's write zio, it's possible * for this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, * zil_lwb_write_done()). */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. This function is idempotent; if * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (lwb->lwb_state != LWB_STATE_NEW) { ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ static const struct { uint64_t limit; uint64_t blksz; } zil_block_buckets[] = { { 4096, 4096 }, /* non TX_WRITE */ { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ { 131072, 131072 }, /* < 128KB writes */ { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. 
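 *
 * A worked example of the size selection performed below (numbers are
 * illustrative only): with zl_cur_used around 20KB, the requested size
 * is 20KB + sizeof (zil_chain_t); the smallest bucket that fits it is
 * the 32768 + 4096 one, so 36864 bytes is suggested.  The MAX() over
 * zl_prev_blks then keeps the block at, say, 69632 bytes if a recent
 * burst needed that much, instead of bouncing between the two sizes.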
 */
static lwb_t *
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
{
	int i;

	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
	lwb->lwb_state = LWB_STATE_CLOSED;

	/*
	 * If there was an allocation failure, then the NULL returned here
	 * will trigger zil_commit_writer_stall() at the caller.  This is
	 * inherently racy, since the allocation may not have happened yet.
	 */
	if (lwb->lwb_error != 0)
		return (NULL);

	/*
	 * Log blocks are pre-allocated.  Here we select the size of the next
	 * block, based on the size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes.  This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes.  This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of, say, 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
		continue;
	zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
	    uint64_t, zil_blksz,
	    uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state));
}

/*
 * Finalize the previously closed block and issue the write zio.
 */
static void
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
	spa_t *spa = zilog->zl_spa;
	zil_chain_t *zilc;
	boolean_t slog;
	zbookmark_phys_t zb;
	zio_priority_t prio;
	int error;

	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);

	/* Actually fill the lwb with the data. */
	for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
	    itx = list_next(&lwb->lwb_itxs, itx))
		zil_lwb_commit(zilog, lwb, itx);
	lwb->lwb_nused = lwb->lwb_nfilled;

	lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done,
	    lwb, ZIO_FLAG_CANFAIL);

	/*
	 * The lwb is now ready to be issued, but it can only be issued once
	 * it has either had its block pointer allocated or had that
	 * allocation fail.  Otherwise leave it as-is, relying on some other
	 * thread to issue it after allocating its block pointer by calling
	 * zil_lwb_write_issue() for the previous lwb(s) in the chain.
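	 *
	 * In outline (a simplified view of the hand-off implemented by the
	 * rest of this function):
	 *
	 *	fill buffer; state = READY;
	 *	if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0)
	 *		return;			the previous lwb will issue us
	 *	for (;;) {
	 *		build write zio; allocate bp for the next lwb;
	 *		hand bp to nlwb; issue this lwb's zios;
	 *		if (nlwb was not already READY)
	 *			break;		nlwb's own issuer continues
	 *		lwb = nlwb;		we adopted the READY nlwb
	 *	}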
*/ mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); return; } mutex_exit(&zilog->zl_lock); next_lwb: if (lwb->lwb_slim) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. */ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { /* * We can't write the lwb if there was an allocation failure, * so create a null zio instead just to maintain dependencies. */ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); lwb->lwb_write_zio->io_error = lwb->lwb_error; } if (lwb->lwb_child_zio) zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* * Open transaction to allocate the next block pointer. */ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); /* * Allocate next the block pointer unless we are already in error. */ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, &slog); } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } /* * Reduce TXG open time by incrementing inflight counter and committing * the transaciton. zil_sync() will wait for it to return to zero. */ mutex_enter(&zilog->zl_lwb_io_lock); lwb->lwb_issued_txg = txg; zilog->zl_lwb_inflight[txg & TXG_MASK]++; zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); mutex_exit(&zilog->zl_lwb_io_lock); dmu_tx_commit(tx); spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* * We've completed all potentially blocking operations. Update the * nlwb and allow it proceed without possible lock order reversals. 
*/ mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); lwb->lwb_state = LWB_STATE_ISSUED; if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; nlwb->lwb_slog = slog; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, * it is on us to issue it and possibly following ones. */ lwb = nlwb; if (lwb) goto next_lwb; } /* * Maximum amount of data that can be put into single log block. */ uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize) { return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a * single log block. Below that it is a tradeoff of additional memory copy * and possibly worse log space efficiency vs additional range lock/unlock. */ static uint_t zil_maxcopied = 7680; uint64_t zil_max_copied_data(zilog_t *zilog) { uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); return (MIN(max_data, zil_maxcopied)); } /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. * Has to be called under zl_issuer_lock to call zil_lwb_write_close() * to chain more lwbs. */ static lwb_t * zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { itx_t *citx; lr_t *lr, *clr; lr_write_t *lrw; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lr = &itx->itx_lr; lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). 
*/ if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to * hold reclen. For WR_COPIED, we need to fit the whole * record in one block, and reclen is the header size + the * data size. For WR_NEED_COPY, we can create multiple * records, splitting the data into multiple blocks, so we * only need to fit one word of data per block; in this case * reclen is just the header size (no data). */ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); citx = zil_itx_clone(itx); clr = &citx->itx_lr; lr_write_t *clrw = (lr_write_t *)clr; clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { citx = itx; clr = lr; } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; lr_write_t *lrw, *lrwb; char *lr_buf; uint64_t dlen, reclen; lr = &itx->itx_lr; lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) return; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; memcpy(lr_buf, lr, reclen); lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. 
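	 *
	 * A compact reference for the three cases handled below (a summary
	 * of the existing behavior, nothing new):
	 *
	 *	WR_COPIED:	the data was copied into the itx when it was
	 *			created, so the memcpy() above already
	 *			wrote it.
	 *	WR_NEED_COPY:	zl_get_data() copies the data now, directly
	 *			into the log record (dbuf points just past
	 *			the header inside lwb_buf).
	 *	WR_INDIRECT:	only a block pointer is logged; zl_get_data()
	 *			issues the data write (dmu_sync) as a child
	 *			of lwb_child_zio.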
*/ if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { lwb->lwb_child_zio = zio_null(NULL, zilog->zl_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); } } /* * The "lwb_child_zio" we pass in will become a child of * "lwb_write_zio", when one is created, so one will be * a parent of any zio's created by the "zl_get_data". * This way "lwb_write_zio" will first wait for children * block pointers before own writing, and then for their * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ memset((char *)dbuf + lrwb->lr_length, 0, dlen - lrwb->lr_length); } /* * Typically, the only return values we should see from * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or * EALREADY. However, it is also possible to see other * error values such as ENOSPC or EINVAL from * dmu_read() -> dnode_hold() -> dnode_hold_impl() or * ENXIO as well as a multitude of others from the * block layer through dmu_buf_hold() -> dbuf_read() * -> zio_wait(), as well as through dmu_read() -> * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> * zio_wait(). When these errors happen, we can assume * that neither an immediate write nor an indirect * write occurred, so we need to fall back to * txg_wait_synced(). This is unusual, so we print to * dmesg whenever one of these errors occurs. */ switch (error) { case 0: break; default: cmn_err(CE_WARN, "zil_lwb_commit() received " "unexpected error %d from ->zl_get_data()" ". 
Falling back to txg_wait_synced().", error); zfs_fallthrough; case EIO: txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: return; } } } lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * zil_itx_create(uint64_t txtype, size_t olrsize) { size_t itxsize, lrsize; itx_t *itx; lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; itx->itx_size = itxsize; return (itx); } static itx_t * zil_itx_clone(itx_t *oitx) { itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; itx->itx_callback_data = NULL; return (itx); } void zil_itx_destroy(itx_t *itx) { IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) itx->itx_callback(itx->itx_callback_data); zio_data_buf_free(itx, itx->itx_size); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. 
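 *
 * To recap the in-memory layout this walks (a simplified picture; the
 * structures themselves are defined in the ZIL headers):
 *
 *	zilog->zl_itxg[txg & TXG_MASK]		per-txg bucket (itxg_t)
 *	    itxg_itxs->i_sync_list		synchronous itxs, in order
 *	    itxg_itxs->i_async_tree		AVL tree keyed by object id
 *		itx_async_node_t (ia_foid)
 *		    ia_list			async itxs for that object
 *
 * Removing all async itxs for an object id therefore means visiting
 * every open txg bucket and splicing out that object's ia_list.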
*/ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). 
*/ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static uint64_t zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. * Allow all earlier itxs to be committed, but ask * caller to do txg_wait_synced(txg) for any new. */ if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); } return (wtxg); } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. 
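		 *
		 * The enclosing loop uses the same per-txg scan idiom as the
		 * other walks in this file; sketched here for clarity
		 * (simplified, no new behavior):
		 *
		 *	otxg = spa_last_synced_txg() + 1    (or ZILTEST_TXG)
		 *	for (txg = otxg; txg < otxg + TXG_CONCURRENT_STATES;
		 *	    txg++) {
		 *		itxg = &zilog->zl_itxg[txg & TXG_MASK];
		 *		mutex_enter(&itxg->itxg_lock);
		 *		if (itxg->itxg_txg != txg)
		 *			skip, after dropping the lock;
		 *		... operate on itxg->itxg_itxs ...
		 *		mutex_exit(&itxg->itxg_lock);
		 *	}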
*/ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } static void zil_burst_done(zilog_t *zilog) { if (!list_is_empty(&zilog->zl_itx_commit_list) || zilog->zl_cur_used == 0) return; if (zilog->zl_parallel) zilog->zl_parallel--; zilog->zl_cur_used = 0; } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). 
*/ if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { /* * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * If the lwb is still opened, it means the workload is really * multi-threaded and we get a chance at write aggregation. * If it is not opened yet, but the previous lwb is still not * flushed, it still means the workload is multi-threaded, but * there was too much time between the commits to aggregate, so * we will try to aggregate on the next commits, but with lower * expectations. */ if (lwb->lwb_state == LWB_STATE_OPENED) { zilog->zl_parallel = ZIL_BURSTS; } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { zilog->zl_parallel = MAX(zilog->zl_parallel, ZIL_BURSTS / 2); } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. We commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. We move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, then it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occurring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted.
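 *
 * To summarize, the decision the loop below makes for each itx it
 * pulls off the commit list is (a restatement of the above, not new
 * policy):
 *
 *	if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT)
 *		commit the itx to the current lwb, or park it on the
 *		"nolwb" lists if no lwb could be allocated;
 *	else
 *		destroy the itx; its txg already reached the main pool.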
*/ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); } else if ((zcw->zcw_lwb != NULL && zcw->zcw_lwb != lwb) || zcw->zcw_done) { /* * Our lwb is done; leave the rest of the * itx list to somebody else who cares. */ zilog->zl_parallel = ZIL_BURSTS; break; } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } list_insert_tail(&nolwb_itxs, itx); } } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in "new" or "opened" state. * * If it's "new", then no itxs have been committed to it, so * there's no point in issuing its zio (i.e. it's "empty"). * * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occurring on * the system, such that this function will be immediately * called again by a different thread and this lwb will be * closed by zil_lwb_assign(). This way, the lwb will be * "full" when it is issued to disk, and we'll make use of * the lwb's size as best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, zil_commit_waiter() will close it and issue * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, which means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in its buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx.
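 *
 * As a rough worked example of mechanism (2) above: the waiter's
 * timeout is scaled off the previous lwb latency, approximately
 *
 *	sleep = (zl_last_lwb_latency * zfs_commit_timeout_pct) / 100;
 *
 * so with the tunable at, say, 5 percent (check the module parameter
 * for the actual default) and a previous lwb that took ~1ms to
 * complete, an idle open lwb would be force-closed after about 50us.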
*/ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); zil_burst_done(zilog); if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); } } } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been committed to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } ZIL_STAT_BUMP(zilog, zil_commit_writer_count); wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); return (wtxg); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state != LWB_STATE_OPENED) return; /* * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order * elsewhere. */ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. 
If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be freed after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) { mutex_exit(&zilog->zl_issuer_lock); return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the CLOSED, ISSUED or DONE states. * * The important thing is that we treat the lwb differently depending * on whether it's OPENED or CLOSED, and block any other threads that * might attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state != LWB_STATE_OPENED) { mutex_exit(&zilog->zl_issuer_lock); return; } /* * We do not need zcw_lock once we hold zl_issuer_lock and know lwb * is still open. But we have to drop it to avoid a deadlock in case * the callback of a zio issued by zil_lwb_write_issue() tries to get * it, while zil_lwb_write_issue() is blocked on an attempt to issue * the next lwb it found in the LWB_STATE_READY state. */ mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); zil_burst_done(zilog); if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. */ zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); mutex_exit(&zilog->zl_issuer_lock); } else { mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); } mutex_enter(&zcw->zcw_lock); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is no other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function.
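 *
 * The wait loop below boils down to the following pattern (a sketch
 * only; the real loop also revalidates the lwb after each wakeup):
 *
 *	while (!zcw->zcw_done) {
 *		if (lwb is still OPENED) {
 *			cv_timedwait_hires(...);        // bounded wait
 *			zil_commit_waiter_timeout(...); // then force-issue
 *		} else {
 *			cv_wait(...);                   // lwb already in flight
 *		}
 *	}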
*/ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signaled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. 
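 *
 * For reference, the states asserted below all correspond to an lwb
 * that has already left the "open" phase; the progression used in this
 * file is roughly:
 *
 *	NEW -> OPENED -> CLOSED -> READY -> ISSUED ->
 *	    WRITE_DONE -> FLUSH_DONE
 *
 * (see the lwb_state_t definition for the authoritative list).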
*/ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter(void) { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. 
A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signaled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signaled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. * * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. 
The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child dependency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the semantics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). 
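 *
 * Putting the pieces together, the body of this function is
 * essentially the following sequence (simplified; error handling and
 * the trailing txg_wait_synced() cases are omitted):
 *
 *	zil_async_to_sync(zilog, foid);        // per-object async -> sync
 *	zcw = zil_alloc_commit_waiter();
 *	zil_commit_itx_assign(zilog, zcw);     // TX_COMMIT itx into queue
 *	wtxg = zil_commit_writer(zilog, zcw);  // move, prune, process itxs
 *	zil_commit_waiter(zilog, zcw);         // block until zcw is "done"
 *	zil_free_commit_waiter(zcw);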
*/ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; zil_lwb_flush_wait_all(zilog, txg); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } else { /* * A destroyed ZIL chain can't contain any TX_SETSAXATTR * records. So, deactivate the feature for this dataset. * We activate it again when we start a new ZIL chain. */ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) dsl_dataset_deactivate_feature(ds, SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. 
*/ if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zil_sums_init(&zil_sums_global); zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zil_kstats_global != NULL) { zil_kstats_global->ks_data = &zil_stats; zil_kstats_global->ks_update = zil_kstats_global_update; zil_kstats_global->ks_private = NULL; kstat_install(zil_kstats_global); } } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); if (zil_kstats_global != NULL) { kstat_delete(zil_kstats_global); zil_kstats_global = NULL; } zil_sums_fini(&zil_sums_global); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { int i; zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. 
*/ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_sums = zil_sums; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) { txg = MAX(txg, lwb->lwb_alloc_txg); txg = MAX(txg, lwb->lwb_max_txg); } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); mutex_exit(&zilog->zl_lwb_io_lock); /* * We need to use txg_wait_synced() to wait until that txg is synced. * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). 
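 *
 * A typical caller-side sketch (illustrative only):
 *
 *	void *cookie;
 *	int error = zil_suspend(osname, &cookie);
 *	if (error == 0) {
 *		// ... operate on the dataset with its ZIL quiesced ...
 *		zil_resume(cookie);
 *	}
 *
 * Passing cookiep == NULL collapses the suspend and resume into this
 * one call, as described above.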
*/ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } /* * The ZIL has work to do. Ensure that the associated encryption * key will remain mapped while we are committing the log by * grabbing a reference to it. If the key isn't loaded we have no * choice but to return an error until the wrapping key is loaded. */ if (os->os_encrypted && dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); return (SET_ERROR(EACCES)); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). 
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (os->os_encrypted) dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. 
*/ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) bp, (void) arg, (void) claim_txg; zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; return (B_TRUE); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } int zil_reset(const char *osname, void *arg) { (void) arg; int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); EXPORT_SYMBOL(zil_close); EXPORT_SYMBOL(zil_replay); EXPORT_SYMBOL(zil_replaying); EXPORT_SYMBOL(zil_destroy); EXPORT_SYMBOL(zil_destroy_sync); EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); EXPORT_SYMBOL(zil_sums_init); EXPORT_SYMBOL(zil_sums_fini); EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 2f5b423ee72e..191166b855f1 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1,5206 +1,5200 @@ 
/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. 
* * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t align, cflags, data_cflags; char name[32]; /* * Create cache for each half-power of 2 size, starting from * SPA_MINBLOCKSIZE. It should give us memory space efficiency * of ~7/8, sufficient for transient allocations mostly using * these caches. */ size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; if (!IS_P2ALIGNED(size, p2 / 2)) continue; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif if (IS_P2ALIGNED(size, PAGESIZE)) align = PAGESIZE; else align = 1 << (highbit64(size ^ (size - 1)) - 1); cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; data_cflags = KMC_NODEBUG; if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. * Save memory by creating only one. 
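 *
 * As a quick worked example of the sizing scheme in this loop: for a
 * 12K buffer the largest power of two at or below it is 8K, and 12K
 * is a multiple of 4K (half of 8K), so a cache is created for it; 10K
 * is not a multiple of 4K, so no cache is created and 10K requests are
 * served by the next larger cache via the backfill loop further below.
 * Whether that cache is the combined "zio_buf_comb" one created here
 * or separate zio_buf/zio_data_buf caches depends only on the cflags
 * comparison above.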
*/ (void) snprintf(name, sizeof (name), "zio_buf_comb_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); zio_data_buf_cache[c] = zio_buf_cache[c]; continue; } (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (size_t i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { VERIFY3P(zio_buf_cache[i], ==, NULL); VERIFY3P(zio_data_buf_cache[i], ==, NULL); } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { (void) size; abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size, &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. 
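 *
 * For orientation, the overall decision order in this function is (a
 * sketch of the code that follows, not additional behavior):
 *
 *	if (BP_HAS_INDIRECT_MAC_CKSUM(bp))  // indirect block: verify MACs
 *	else if (BP_IS_AUTHENTICATED(bp))   // authenticated: check the MAC
 *	else                                // encrypted: decode salt/IV/MAC
 *	                                    //   and decrypt via spa_do_crypt_abd()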
*/ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. */ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } void zio_add_child_first(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. */ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; ASSERT(list_is_empty(&cio->io_parent_list)); list_insert_head(&cio->io_parent_list, zl); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute 
|= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We only do this if the parent's zio type matches the * child's type. Otherwise dispatch the parent zio in its * own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && pio->io_type == zio->io_type) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, 
zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) { zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ } else { zio->io_bp = (blkptr_t *)bp; } zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } /* * ZIO intended to be between others. Provides synchronization at READY * and DONE pipeline stages and calls the respective callbacks. */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } /* * ZIO intended to be a root of a tree. Unlike null ZIO does not have a * READY pipeline stage (is ready on creation), so it should not be used * as child of any ZIO that may need waiting for grandchildren READY stage * (any other ZIO type). */ zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); return (zio); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) 
{ va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)bp->blk_phys_birth, (long long)bp->blk_birth, (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: * BLK_VERIFY_ONLY: evaluate the block * BLK_VERIFY_LOG: evaluate the block and log problems * BLK_VERIFY_HALT: call zfs_panic_recover on error * * Values for blk_config_flag: * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be * obtained for reader * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. 
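 * In short: the per-DVA checks below dereference spa_root_vdev and its
 * children, which requires SCL_VDEV.  BLK_CONFIG_HELD only asserts the
 * lock, BLK_CONFIG_NEEDED takes it as reader here and drops it at the
 * end, and BLK_CONFIG_SKIP returns after just the header checks above.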
*/ if (!spa->spa_trust_config) return (errors == 0); switch (blk_config) { case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); break; case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; case BLK_CONFIG_SKIP: return (errors == 0); default: panic("invalid blk_config %u", blk_config); } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. 
[see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || brt_maybe_exists(spa, bp)) { /* * GANG, DEDUP and BRT blocks can induce a read (for the gang * block header, the DDT or the BRT), so issue them * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
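 * As a rough sketch (not a verbatim caller), claiming a log block looks
 * something like:
 *
 *	zio_wait(zio_claim(NULL, spa, spa_min_claim_txg(spa), bp,
 *	    NULL, NULL, 0));
 *
 * whereas passing txg == 0 only checks that the block could be claimed.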
*/ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, flags)); - } - + zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + zio->io_cmd = cmd; return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. 
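 * A rough sketch of a typical caller (cvd, child_offset, child_done and
 * cb_arg are illustrative names, not real symbols):
 *
 *	zio_nowait(zio_vdev_child_io(pio, pio->io_bp, cvd, child_offset,
 *	    abd, pio->io_size, ZIO_TYPE_READ, pio->io_priority, 0,
 *	    child_done, cb_arg));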
*/ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void -zio_flush(zio_t *zio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + if (vd->vdev_nowritecache) + return; + if (vd->vdev_children == 0) { + zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd, + DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + } else { + for (uint64_t c = 0; c < vd->vdev_children; c++) + zio_flush(pio, vd->vdev_child[c]); + } } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. 
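 * (Shrinking only trims the in-memory sizes of a not-yet-issued zio;
 * note the io_executor == NULL assert above.  Nothing on disk changes.)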
* Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * Round provided allocation size up to a value that can be allocated * by at least some vdev(s) in the pool with minimum or no additional * padding and without extra space usage on others */ static uint64_t zio_roundup_alloc_size(spa_t *spa, uint64_t size) { if (size > spa->spa_min_alloc) return (roundup(size, spa->spa_gcd_alloc)); return (spa->spa_min_alloc); } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. 
*/ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || MIN(zp->zp_copies, spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = NULL; psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cbuf != NULL) zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ size_t rounded = (size_t)zio_roundup_alloc_size(spa, psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. 
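 * (Concretely: any io_bp_override is dropped just below and io_bp_orig
 * and the original pipeline are restored, so this is set up again as an
 * ordinary, non-override write.)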
*/ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? 
TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if ((zio->io_priority == ZIO_PRIORITY_NOW || zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, &zio->io_tqent, zio); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. 
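 * Missing the delay in this unlikely case only means the zio
 * completes ahead of its requested target latency; the I/O
 * itself is unaffected.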
*/ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. 
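 * (zio_execute() is also the entry point handed to the zio taskqs via
 * spa_taskq_dispatch_ent() in zio_taskq_dispatch() above, while
 * zio_wait() and zio_nowait() below call __zio_execute() directly.)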
*/ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. 
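 * A minimal sketch of the usual pattern (done_cb, arg and zb are
 * illustrative, not real symbols):
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(rio, spa, bp, abd, size, done_cb, arg,
 *	    ZIO_PRIORITY_SYNC_READ, 0, &zb));
 *	error = zio_wait(rio);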
*/ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || (pio->io_pipeline & ZIO_STAGE_READY) == 0; pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); zio_link_t *zl = NULL; while ((gio = zio_walk_parents(pio, &zl)) != NULL) { for (int w = 0; w < ZIO_WAIT_TYPES; w++) { gio->io_children[pio->io_child_type][w] += !pio->io_state[w]; } } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. 
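 * (The godfather here is typically spa_suspend_zio_root or one of the
 * spa_async_zio_root zios; see zio_suspend() below and zio_nowait()
 * above, which attach children to them.)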
*/ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. 
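 * As a rough picture, a nested gang tree might look like this
 * (H = gang header, D = data leaf):
 *
 *	          H          <-- gang leader
 *	        / | \
 *	       D  D  H
 *	            /|\
 *	           D D D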
* * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
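 *
 * Either way the per-type callback comes from zio_gang_issue_func[],
 * so one walk serves reads, rewrites, frees and claims alike; e.g. a
 * free of the tree boils down (roughly) to
 *
 *     zio_gang_issue_func[ZIO_TYPE_FREE](...) -> zio_free_gang()
 *         -> zio_free_sync()
 *
 * for every non-hole bp the walk visits.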
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) { cio->io_allocator = pio->io_allocator; cio->io_wr_iss_tq = pio->io_wr_iss_tq; } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * If one copy was requested, store 2 copies of the GBH, so that we * can still traverse all the data (e.g. to free or scrub) even if a * block is damaged. Note that we can't store 3 copies of the GBH in * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. 
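 *
 * For example, a copies=1 write on a pool whose spa_max_replication()
 * is 3 gets a MIN(2, 3) == 2 DVA gang header below, while each gang
 * member still carries only the single copy that was asked for.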
*/ int gbh_copies = copies; if (gbh_copies == 1) { gbh_copies = MIN(2, spa_max_replication(spa)); } ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(zio, cio); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. 
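 *
 * (For reference, the sizing loop above gives each remaining child an
 * equal share of what is left, rounded up to a SPA_MINBLOCKSIZE
 * multiple: a 96K residual splits into three 32K members, while 100K
 * splits into 33.5K + 33.5K + 33K because the rounding is reapplied on
 * every pass.)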
*/ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. 
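 *
 * E.g. for a block that has been cloned, every free but the last one
 * lands here: the reference count drops and the pipeline is trimmed to
 * ZIO_INTERLOCK_PIPELINE so the DVAs stay allocated; only the free
 * that drops the final reference continues on and actually returns the
 * space.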
*/ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). 
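 *
 * In other words the check below is, roughly:
 *
 *     if an in-flight lead zio exists for this entry
 *         compare our data against that zio's (orig or raw abd)
 *     else if an on-disk copy exists
 *         read it back (raw, or through the ARC) and compare
 *     else
 *         no collision
 *
 * Returning B_TRUE makes the caller fall back to a stronger checksum
 * or to an ordinary non-dedup write rather than share blocks whose
 * contents merely hash alike.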
*/ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) 
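 *
 * E.g. a verify miss with a checksum that isn't dedup-capable restarts
 * the zio from ZIO_STAGE_OPEN using spa_dedup_checksum() (typically
 * SHA-256), while a verify miss with an already dedup-capable checksum
 * simply clears the dedup bit and proceeds as a plain, uniquely
 * allocated write.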
*/ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. 
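 *
 * The throttle is a simple admission loop: zio_dva_throttle() parks
 * writes in the per-allocator AVL tree, this function admits the head
 * of that queue only if its reservation fits, and the unreserve paths
 * call zio_allocate_dispatch() to pull the next waiter through once
 * slots free up again.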
*/ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); nio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. 
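 *
 * So the overall order of attempts for a single write is, roughly:
 * preferred (e.g. special) class -> normal class -> gang block ->
 * ENOSPC, and only the class switch needs the reservation transfer
 * below.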
*/ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; VERIFY(metaslab_class_throttle_reserve( spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } zio->io_metaslab_class = mc = spa_normal_class(spa); if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. 
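 *
 * E.g. with spa_alloc_count == 4 the expression below reduces to
 * cityhash4(0, 0, 0, ds_object) % 4: ZIL writers for different
 * datasets tend to spread across all four allocators, while any one
 * dataset always maps to the same allocator.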
*/ int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. 
*/ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. 
* * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. 
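 *
 * Roughly, for an ordinary encrypted data block the stage below:
 *
 *     1. picks up the salt and IV -- decoded from the bp for ZIL
 *        blocks, produced while encrypting otherwise;
 *     2. encrypts io_abd into a freshly allocated ABD via
 *        spa_do_crypt_abd();
 *     3. encodes the salt, IV and MAC into the bp (for ZIL blocks, the
 *        MAC goes into the embedded checksum instead) and pushes the
 *        encrypted ABD as a transform so the remaining stages write
 *        ciphertext.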
*/ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. 
This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? 
e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } #ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; #endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. 
*/ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. 
*/ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (adata != NULL && asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). 
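	 *
	 * Summary of the checks below (added for clarity, not part of the
	 * original comment):
	 *   - an allocating, non-CANFAIL zio is flagged to reexecute
	 *     immediately (ZIO_REEXECUTE_NOW), unless the error is ENOSPC,
	 *     which flags a suspend instead;
	 *   - a read or free that hits ENXIO outside of pool load and
	 *     outside the scan thread, with failmode other than
	 *     "continue", flags a suspend;
	 *   - any remaining non-CANFAIL failure also flags a suspend
	 *     rather than returning the error to the caller.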
*/ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". 
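	 *
	 * (Clarifying note, not part of the original comment:
	 * "next_to_execute" is the mechanism used at the bottom of
	 * zio_done() to hand a ready parent back to the pipeline so it can
	 * be executed next without additional recursion; skipping it here
	 * simply means the parent is notified the ordinary way.)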
*/ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, zio_reexecute, zio, 0, &zio->io_tqent, NULL); } return (NULL); } ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). 
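 *
 * Worked example (added for illustration, not in the original text): with
 * 128K indirect blocks, each indirect block holds 1024 128-byte block
 * pointers, so a level-2 bookmark spans 1024 * 1024 = 1048576 L0 blocks;
 * a level-2 bookmark with blkid 3 therefore has an L0 equivalent of
 * 3 * 1048576 = 3145728.
 *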
* zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. 
The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. */ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index d5d7bb6c8360..b4d2b91dd476 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -1,3890 +1,3896 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. # Copyright (c) 2012, 2020, Delphix. All rights reserved. # Copyright (c) 2017, Tim Chase. All rights reserved. # Copyright (c) 2017, Nexenta Systems Inc. All rights reserved. # Copyright (c) 2017, Lawrence Livermore National Security LLC. # Copyright (c) 2017, Datto Inc. All rights reserved. # Copyright (c) 2017, Open-E Inc. All rights reserved. # Copyright (c) 2021, The FreeBSD Foundation. # Use is subject to license terms. # . 
${STF_SUITE}/include/tunables.cfg . ${STF_TOOLS}/include/logapi.shlib . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib +# On AlmaLinux 9 we will see $PWD = '.' instead of the full path. This causes +# some tests to fail. Fix it up here. +if [ "$PWD" = "." ] ; then + PWD="$(readlink -f $PWD)" +fi + # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. # if [ -n "$STF_PATH" ]; then export PATH="$STF_PATH" fi # # Generic dot version comparison function # # Returns success when version $1 is greater than or equal to $2. # function compare_version_gte { [ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ] } # Linux kernel version comparison function # # $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version # # Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] # function linux_version { typeset ver="$1" [ -z "$ver" ] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+") typeset version major minor _ IFS='.' read -r version major minor _ <<<"$ver" [ -z "$version" ] && version=0 [ -z "$major" ] && major=0 [ -z "$minor" ] && minor=0 echo $((version * 100000 + major * 1000 + minor)) } # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise function is_linux { [ "$UNAME" = "Linux" ] } # Determine if this is an illumos test system # # Return 0 if platform illumos, 1 if otherwise function is_illumos { [ "$UNAME" = "illumos" ] } # Determine if this is a FreeBSD test system # # Return 0 if platform FreeBSD, 1 if otherwise function is_freebsd { [ "$UNAME" = "FreeBSD" ] } # Determine if this is a 32-bit system # # Return 0 if platform is 32-bit, 1 if otherwise function is_32bit { [ $(getconf LONG_BIT) = "32" ] } # Determine if kmemleak is enabled # # Return 0 if kmemleak is enabled, 1 if otherwise function is_kmemleak { is_linux && [ -e /sys/kernel/debug/kmemleak ] } # Determine whether a dataset is mounted # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs # # Return 0 if dataset is mounted; 1 if unmounted; 2 on error function ismounted { typeset fstype=$2 [[ -z $fstype ]] && fstype=zfs typeset out dir name case $fstype in zfs) if [[ "$1" == "/"* ]] ; then ! zfs mount | awk -v fs="$1" '$2 == fs {exit 1}' else ! zfs mount | awk -v ds="$1" '$1 == ds {exit 1}' fi ;; ufs|nfs) if is_freebsd; then mount -pt $fstype | while read dev dir _t _flags; do [[ "$1" == "$dev" || "$1" == "$dir" ]] && return 0 done else out=$(df -F $fstype $1 2>/dev/null) || return dir=${out%%\(*} dir=${dir%% *} name=${out##*\(} name=${name%%\)*} name=${name%% *} [[ "$1" == "$dir" || "$1" == "$name" ]] && return 0 fi ;; ext*) df -t $fstype $1 > /dev/null 2>&1 ;; zvol) if [[ -L "$ZVOL_DEVDIR/$1" ]]; then link=$(readlink -f $ZVOL_DEVDIR/$1) [[ -n "$link" ]] && \ mount | grep -q "^$link" && \ return 0 fi ;; *) false ;; esac } # Return 0 if a dataset is mounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function mounted { ismounted $1 $2 } # Return 0 if a dataset is unmounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function unmounted { ! ismounted $1 $2 } function default_setup { default_setup_noexit "$@" log_pass } function default_setup_no_mountpoint { default_setup_noexit "$1" "$2" "$3" "yes" log_pass } # # Given a list of disks, setup storage pools and datasets. 
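# E.g. (illustrative; $DISKS is supplied by the test environment):
#   default_setup_noexit "$DISKS" "" "true"
# creates $TESTPOOL on the given disks, $TESTPOOL/$TESTFS mounted at
# $TESTDIR, and a $VOLSIZE volume $TESTPOOL/$TESTVOL, without exiting.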
# function default_setup_noexit { typeset disklist=$1 typeset container=$2 typeset volume=$3 typeset no_mountpoint=$4 log_note begin default_setup_noexit if is_global_zone; then if poolexists $TESTPOOL ; then destroy_pool $TESTPOOL fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL $disklist else reexport_pool fi rm -rf $TESTDIR || log_unresolved Could not remove $TESTDIR mkdir -p $TESTDIR || log_unresolved Could not create $TESTDIR log_must zfs create $TESTPOOL/$TESTFS if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS fi if [[ -n $container ]]; then rm -rf $TESTDIR1 || \ log_unresolved Could not remove $TESTDIR1 mkdir -p $TESTDIR1 || \ log_unresolved Could not create $TESTDIR1 log_must zfs create $TESTPOOL/$TESTCTR log_must zfs set canmount=off $TESTPOOL/$TESTCTR log_must zfs create $TESTPOOL/$TESTCTR/$TESTFS1 if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR1 \ $TESTPOOL/$TESTCTR/$TESTFS1 fi fi if [[ -n $volume ]]; then if is_global_zone ; then log_must zfs create -V $VOLSIZE $TESTPOOL/$TESTVOL block_device_wait else log_must zfs create $TESTPOOL/$TESTVOL fi fi } # # Given a list of disks, setup a storage pool, file system and # a container. # function default_container_setup { typeset disklist=$1 default_setup "$disklist" "true" } # # Given a list of disks, setup a storage pool,file system # and a volume. # function default_volume_setup { typeset disklist=$1 default_setup "$disklist" "" "true" } # # Given a list of disks, setup a storage pool,file system, # a container and a volume. # function default_container_volume_setup { typeset disklist=$1 default_setup "$disklist" "true" "true" } # # Create a snapshot on a filesystem or volume. Defaultly create a snapshot on # filesystem # # $1 Existing filesystem or volume name. Default, $TESTPOOL/$TESTFS # $2 snapshot name. Default, $TESTSNAP # function create_snapshot { typeset fs_vol=${1:-$TESTPOOL/$TESTFS} typeset snap=${2:-$TESTSNAP} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." if snapexists $fs_vol@$snap; then log_fail "$fs_vol@$snap already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." log_must zfs snapshot $fs_vol@$snap } # # Create a clone from a snapshot, default clone name is $TESTCLONE. # # $1 Existing snapshot, $TESTPOOL/$TESTFS@$TESTSNAP is default. # $2 Clone name, $TESTPOOL/$TESTCLONE is default. # function create_clone # snapshot clone { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} typeset clone=${2:-$TESTPOOL/$TESTCLONE} [[ -z $snap ]] && \ log_fail "Snapshot name is undefined." [[ -z $clone ]] && \ log_fail "Clone name is undefined." log_must zfs clone $snap $clone } # # Create a bookmark of the given snapshot. Defaultly create a bookmark on # filesystem. # # $1 Existing filesystem or volume name. Default, $TESTFS # $2 Existing snapshot name. Default, $TESTSNAP # $3 bookmark name. Default, $TESTBKMARK # function create_bookmark { typeset fs_vol=${1:-$TESTFS} typeset snap=${2:-$TESTSNAP} typeset bkmark=${3:-$TESTBKMARK} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." [[ -z $bkmark ]] && log_fail "Bookmark's name is undefined." if bkmarkexists $fs_vol#$bkmark; then log_fail "$fs_vol#$bkmark already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." snapexists $fs_vol@$snap || \ log_fail "$fs_vol@$snap must exist." 
log_must zfs bookmark $fs_vol@$snap $fs_vol#$bkmark } # # Create a temporary clone result of an interrupted resumable 'zfs receive' # $1 Destination filesystem name. Must not exist, will be created as the result # of this function along with its %recv temporary clone # $2 Source filesystem name. Must not exist, will be created and destroyed # function create_recv_clone { typeset recvfs="$1" typeset sendfs="${2:-$TESTPOOL/create_recv_clone}" typeset snap="$sendfs@snap1" typeset incr="$sendfs@snap2" typeset mountpoint="$TESTDIR/create_recv_clone" typeset sendfile="$TESTDIR/create_recv_clone.zsnap" [[ -z $recvfs ]] && log_fail "Recv filesystem's name is undefined." datasetexists $recvfs && log_fail "Recv filesystem must not exist." datasetexists $sendfs && log_fail "Send filesystem must not exist." log_must zfs create -o compression=off -o mountpoint="$mountpoint" $sendfs log_must zfs snapshot $snap log_must eval "zfs send $snap | zfs recv -u $recvfs" log_must mkfile 1m "$mountpoint/data" log_must zfs snapshot $incr log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 \ iflag=fullblock > $sendfile" log_mustnot eval "zfs recv -su $recvfs < $sendfile" destroy_dataset "$sendfs" "-r" log_must rm -f "$sendfile" if [[ $(get_prop 'inconsistent' "$recvfs/%recv") -ne 1 ]]; then log_fail "Error creating temporary $recvfs/%recv clone" fi } function default_mirror_setup { default_mirror_setup_noexit $1 $2 $3 log_pass } # # Given a pair of disks, set up a storage pool and dataset for the mirror # @parameters: $1 the primary side of the mirror # $2 the secondary side of the mirror # @uses: ZPOOL ZFS TESTPOOL TESTFS function default_mirror_setup_noexit { readonly func="default_mirror_setup_noexit" typeset primary=$1 typeset secondary=$2 [[ -z $primary ]] && \ log_fail "$func: No parameters passed" [[ -z $secondary ]] && \ log_fail "$func: No secondary partition passed" [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL mirror $@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # Destroy the configured testpool mirrors. # the mirrors are of the form ${TESTPOOL}{number} # @uses: ZPOOL ZFS TESTPOOL function destroy_mirrors { default_cleanup_noexit log_pass } function default_raidz_setup { default_raidz_setup_noexit "$*" log_pass } # # Given a minimum of two disks, set up a storage pool and dataset for the raid-z # $1 the list of disks # function default_raidz_setup_noexit { typeset disklist="$*" disks=(${disklist[*]}) if [[ ${#disks[*]} -lt 2 ]]; then log_fail "A raid-z requires a minimum of two disks." fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL raidz $disklist log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # Common function used to cleanup storage pools and datasets. # # Invoked at the start of the test suite to ensure the system # is in a known state, and also at the end of each set of # sub-tests to ensure errors from one set of tests doesn't # impact the execution of the next set. function default_cleanup { default_cleanup_noexit log_pass } # # Utility function used to list all available pool names. # # NOTE: $KEEP is a variable containing pool names, separated by a newline # character, that must be excluded from the returned list. # function get_all_pools { zpool list -H -o name | grep -Fvx "$KEEP" | grep -v "$NO_POOLS" } function default_cleanup_noexit { typeset pool="" # # Destroying the pool will also destroy any # filesystems it contains. 
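	# Pools named in $KEEP or $NO_POOLS are excluded by get_all_pools()
	# above, and each remaining pool is only destroyed once
	# safe_to_destroy_pool() confirms no other pool is built on top of it.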
# if is_global_zone; then zfs unmount -a > /dev/null 2>&1 ALL_POOLS=$(get_all_pools) # Here, we loop through the pools we're allowed to # destroy, only destroying them if it's safe to do # so. while [ ! -z ${ALL_POOLS} ] do for pool in ${ALL_POOLS} do if safe_to_destroy_pool $pool ; then destroy_pool $pool fi done ALL_POOLS=$(get_all_pools) done zfs mount -a else typeset fs="" for fs in $(zfs list -H -o name \ | grep "^$ZONE_POOL/$ZONE_CTR[01234]/"); do destroy_dataset "$fs" "-Rf" done # Need cleanup here to avoid garbage dir left. for fs in $(zfs list -H -o name); do [[ $fs == /$ZONE_POOL ]] && continue [[ -d $fs ]] && log_must rm -rf $fs/* done # # Reset the $ZONE_POOL/$ZONE_CTR[01234] file systems property to # the default value # for fs in $(zfs list -H -o name); do if [[ $fs == $ZONE_POOL/$ZONE_CTR[01234] ]]; then log_must zfs set reservation=none $fs log_must zfs set recordsize=128K $fs log_must zfs set mountpoint=/$fs $fs typeset enc=$(get_prop encryption $fs) if [ -z "$enc" ] || [ "$enc" = "off" ]; then log_must zfs set checksum=on $fs fi log_must zfs set compression=off $fs log_must zfs set atime=on $fs log_must zfs set devices=off $fs log_must zfs set exec=on $fs log_must zfs set setuid=on $fs log_must zfs set readonly=off $fs log_must zfs set snapdir=hidden $fs log_must zfs set aclmode=groupmask $fs log_must zfs set aclinherit=secure $fs fi done fi [[ -d $TESTDIR ]] && \ log_must rm -rf $TESTDIR disk1=${DISKS%% *} if is_mpath_device $disk1; then delete_partitions fi rm -f $TEST_BASE_DIR/{err,out} } # # Common function used to cleanup storage pools, file systems # and containers. # function default_container_cleanup { if ! is_global_zone; then reexport_pool fi ismounted $TESTPOOL/$TESTCTR/$TESTFS1 && log_must zfs unmount $TESTPOOL/$TESTCTR/$TESTFS1 destroy_dataset "$TESTPOOL/$TESTCTR/$TESTFS1" "-R" destroy_dataset "$TESTPOOL/$TESTCTR" "-Rf" [[ -e $TESTDIR1 ]] && \ log_must rm -rf $TESTDIR1 default_cleanup } # # Common function used to cleanup snapshot of file system or volume. Default to # delete the file system's snapshot # # $1 snapshot name # function destroy_snapshot { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if ! snapexists $snap; then log_fail "'$snap' does not exist." fi # # For the sake of the value which come from 'get_prop' is not equal # to the really mountpoint when the snapshot is unmounted. So, firstly # check and make sure this snapshot's been mounted in current system. # typeset mtpt="" if ismounted $snap; then mtpt=$(get_prop mountpoint $snap) fi destroy_dataset "$snap" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup clone. # # $1 clone name # function destroy_clone { typeset clone=${1:-$TESTPOOL/$TESTCLONE} if ! datasetexists $clone; then log_fail "'$clone' does not existed." fi # With the same reason in destroy_snapshot typeset mtpt="" if ismounted $clone; then mtpt=$(get_prop mountpoint $clone) fi destroy_dataset "$clone" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup bookmark of file system or volume. Default # to delete the file system's bookmark. # # $1 bookmark name # function destroy_bookmark { typeset bkmark=${1:-$TESTPOOL/$TESTFS#$TESTBKMARK} if ! bkmarkexists $bkmark; then log_fail "'$bkmarkp' does not existed." fi destroy_dataset "$bkmark" } # Return 0 if a snapshot exists; $? otherwise # # $1 - snapshot name function snapexists { zfs list -H -t snapshot "$1" > /dev/null 2>&1 } # # Return 0 if a bookmark exists; $? 
otherwise # # $1 - bookmark name # function bkmarkexists { zfs list -H -t bookmark "$1" > /dev/null 2>&1 } # # Return 0 if a hold exists; $? otherwise # # $1 - hold tag # $2 - snapshot name # function holdexists { ! zfs holds "$2" | awk -v t="$1" '$2 ~ t { exit 1 }' } # # Set a property to a certain value on a dataset. # Sets a property of the dataset to the value as passed in. # @param: # $1 dataset who's property is being set # $2 property to set # $3 value to set property to # @return: # 0 if the property could be set. # non-zero otherwise. # @use: ZFS # function dataset_setprop { typeset fn=dataset_setprop if (($# < 3)); then log_note "$fn: Insufficient parameters (need 3, had $#)" return 1 fi typeset output= output=$(zfs set $2=$3 $1 2>&1) typeset rv=$? if ((rv != 0)); then log_note "Setting property on $1 failed." log_note "property $2=$3" log_note "Return Code: $rv" log_note "Output: $output" return $rv fi return 0 } # # Check a numeric assertion # @parameter: $@ the assertion to check # @output: big loud notice if assertion failed # @use: log_fail # function assert { (($@)) || log_fail "$@" } # # Function to format partition size of a disk # Given a disk cxtxdx reduces all partitions # to 0 size # function zero_partitions # { typeset diskname=$1 typeset i if is_freebsd; then gpart destroy -F $diskname elif is_linux; then DSK=$DEV_DSKDIR/$diskname DSK=$(echo $DSK | sed -e "s|//|/|g") log_must parted $DSK -s -- mklabel gpt blockdev --rereadpt $DSK 2>/dev/null block_device_wait else for i in 0 1 3 4 5 6 7 do log_must set_partition $i "" 0mb $diskname done fi return 0 } # # Given a slice, size and disk, this function # formats the slice to the specified size. # Size should be specified with units as per # the `format` command requirements eg. 100mb 3gb # # NOTE: This entire interface is problematic for the Linux parted utility # which requires the end of the partition to be specified. It would be # best to retire this interface and replace it with something more flexible. # At the moment a best effort is made. # # arguments: function set_partition { typeset -i slicenum=$1 typeset start=$2 typeset size=$3 typeset disk=${4#$DEV_DSKDIR/} disk=${disk#$DEV_RDSKDIR/} case "$UNAME" in Linux) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk typeset size_mb=${size%%[mMgG]} size_mb=${size_mb%%[mMgG][bB]} if [[ ${size:1:1} == 'g' ]]; then ((size_mb = size_mb * 1024)) fi # Create GPT partition table when setting slice 0 or # when the device doesn't already contain a GPT label. parted $disk -s -- print 1 >/dev/null typeset ret_val=$? if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then if ! parted $disk -s -- mklabel gpt; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi # When no start is given align on the first cylinder. if [[ -z "$start" ]]; then start=1 fi # Determine the cylinder size for the device and using # that calculate the end offset in cylinders. typeset -i cly_size_kb=0 cly_size_kb=$(parted -m $disk -s -- unit cyl print | awk -F '[:k.]' 'NR == 3 {print $4}') ((end = (size_mb * 1024 / cly_size_kb) + start)) parted $disk -s -- \ mkpart part$slicenum ${start}cyl ${end}cyl typeset ret_val=$? if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi blockdev --rereadpt $disk 2>/dev/null block_device_wait $disk ;; FreeBSD) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk if [[ $slicenum -eq 0 ]] || ! 
gpart show $disk >/dev/null 2>&1; then gpart destroy -F $disk >/dev/null 2>&1 if ! gpart create -s GPT $disk; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi typeset index=$((slicenum + 1)) if [[ -n $start ]]; then start="-b $start" fi gpart add -t freebsd-zfs $start -s $size -i $index $disk if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi block_device_wait $disk ;; *) if [[ -z $slicenum || -z $size || -z $disk ]]; then log_fail "The slice, size or disk name is unspecified." fi typeset format_file=/var/tmp/format_in.$$ echo "partition" >$format_file echo "$slicenum" >> $format_file echo "" >> $format_file echo "" >> $format_file echo "$start" >> $format_file echo "$size" >> $format_file echo "label" >> $format_file echo "" >> $format_file echo "q" >> $format_file echo "q" >> $format_file format -e -s -d $disk -f $format_file typeset ret_val=$? rm -f $format_file ;; esac if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" return 1 fi return 0 } # # Delete all partitions on all disks - this is specifically for the use of multipath # devices which currently can only be used in the test suite as raw/un-partitioned # devices (ie a zpool cannot be created on a whole mpath device that has partitions) # function delete_partitions { typeset disk if [[ -z $DISKSARRAY ]]; then DISKSARRAY=$DISKS fi if is_linux; then typeset -i part for disk in $DISKSARRAY; do for (( part = 1; part < MAX_PARTITIONS; part++ )); do typeset partition=${disk}${SLICE_PREFIX}${part} parted $DEV_DSKDIR/$disk -s rm $part > /dev/null 2>&1 if lsblk | grep -qF ${partition}; then log_fail "Partition ${partition} not deleted" else log_note "Partition ${partition} deleted" fi done done elif is_freebsd; then for disk in $DISKSARRAY; do if gpart destroy -F $disk; then log_note "Partitions for ${disk} deleted" else log_fail "Partitions for ${disk} not deleted" fi done fi } # # Get the end cyl of the given slice # function get_endslice # { typeset disk=$1 typeset slice=$2 if [[ -z $disk || -z $slice ]] ; then log_fail "The disk name or slice number is unspecified." fi case "$UNAME" in Linux) endcyl=$(parted -s $DEV_DSKDIR/$disk -- unit cyl print | \ awk "/part${slice}/"' {sub(/cyl/, "", $3); print $3}') ((endcyl = (endcyl + 1))) ;; FreeBSD) disk=${disk#/dev/zvol/} disk=${disk%p*} slice=$((slice + 1)) endcyl=$(gpart show $disk | \ awk -v slice=$slice '$3 == slice { print $1 + $2 }') ;; *) disk=${disk#/dev/dsk/} disk=${disk#/dev/rdsk/} disk=${disk%s*} typeset -i ratio=0 ratio=$(prtvtoc /dev/rdsk/${disk}s2 | \ awk '/sectors\/cylinder/ {print $2}') if ((ratio == 0)); then return fi typeset -i endcyl=$(prtvtoc -h /dev/rdsk/${disk}s2 | awk -v token="$slice" '$1 == token {print $6}') ((endcyl = (endcyl + 1) / ratio)) ;; esac echo $endcyl } # # Given a size,disk and total slice number, this function formats the # disk slices from 0 to the total slice number with the same specified # size. # function partition_disk # { typeset -i i=0 typeset slice_size=$1 typeset disk_name=$2 typeset total_slices=$3 typeset cyl zero_partitions $disk_name while ((i < $total_slices)); do if ! 
is_linux; then if ((i == 2)); then ((i = i + 1)) continue fi fi log_must set_partition $i "$cyl" $slice_size $disk_name cyl=$(get_endslice $disk_name $i) ((i = i+1)) done } # # This function continues to write to a filenum number of files into dirnum # number of directories until either file_write returns an error or the # maximum number of files per directory have been written. # # Usage: # fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] # # Return value: 0 on success # non 0 on error # # Where : # destdir: is the directory where everything is to be created under # dirnum: the maximum number of subdirectories to use, -1 no limit # filenum: the maximum number of files per subdirectory # bytes: number of bytes to write # num_writes: number of types to write out bytes # data: the data that will be written # # E.g. # fill_fs /testdir 20 25 1024 256 0 # # Note: bytes * num_writes equals the size of the testfile # function fill_fs # destdir dirnum filenum bytes num_writes data { typeset destdir=${1:-$TESTDIR} typeset -i dirnum=${2:-50} typeset -i filenum=${3:-50} typeset -i bytes=${4:-8192} typeset -i num_writes=${5:-10240} typeset data=${6:-0} mkdir -p $destdir/{1..$dirnum} for f in $destdir/{1..$dirnum}/$TESTFILE{1..$filenum}; do file_write -o create -f $f -b $bytes -c $num_writes -d $data \ || return done } # Get the specified dataset property in parsable format or fail function get_prop # property dataset { typeset prop=$1 typeset dataset=$2 zfs get -Hpo value "$prop" "$dataset" || log_fail "zfs get $prop $dataset" } # Get the specified pool property in parsable format or fail function get_pool_prop # property pool { typeset prop=$1 typeset pool=$2 zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } # Return 0 if a pool exists; $? otherwise # # $1 - pool name function poolexists { typeset pool=$1 if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi zpool get name "$pool" > /dev/null 2>&1 } # Return 0 if all the specified datasets exist; $? otherwise # # $1-n dataset name function datasetexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi zfs get name "$@" > /dev/null 2>&1 } # return 0 if none of the specified datasets exists, otherwise return 1. # # $1-n dataset name function datasetnonexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi while (($# > 0)); do zfs list -H -t filesystem,snapshot,volume $1 > /dev/null 2>&1 \ && return 1 shift done return 0 } # FreeBSD breaks exports(5) at whitespace and doesn't process escapes # Solaris just breaks # # cf. https://github.com/openzfs/zfs/pull/13165#issuecomment-1059845807 # # Linux can have spaces (which are \OOO-escaped), # but can't have backslashes because they're parsed recursively function shares_can_have_whitespace { is_linux } function is_shared_freebsd { typeset fs=$1 pgrep -q mountd && showmount -E | grep -qx "$fs" } function is_shared_illumos { typeset fs=$1 typeset mtpt for mtpt in `share | awk '{print $2}'` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done typeset stat=$(svcs -H -o STA nfs/server:default) if [[ $stat != "ON" ]]; then log_note "Current nfs/server status: $stat" fi return 1 } function is_shared_linux { typeset fs=$1 ! exportfs -s | awk -v fs="${fs//\\/\\\\}" '/^\// && $1 == fs {exit 1}' } # # Given a mountpoint, or a dataset name, determine if it is shared via NFS. # # Returns 0 if shared, 1 otherwise. 
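# E.g. (illustrative):
#   is_shared $TESTPOOL/$TESTFS && log_note "$TESTPOOL/$TESTFS is exported"
# A dataset name is first resolved to its mountpoint; "none", "legacy"
# and "-" mountpoints are treated as not shared.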
# function is_shared { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case "$mtpt" in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case "$UNAME" in FreeBSD) is_shared_freebsd "$fs" ;; Linux) is_shared_linux "$fs" ;; *) is_shared_illumos "$fs" ;; esac } function is_exported_illumos { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$mtpt" = "$fs" ] && return done < /etc/dfs/sharetab return 1 } function is_exported_freebsd { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$mtpt" = "$fs" ] && return done < /etc/zfs/exports return 1 } function is_exported_linux { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$(printf "$mtpt")" = "$fs" ] && return done < /etc/exports.d/zfs.exports return 1 } # # Given a mountpoint, or a dataset name, determine if it is exported via # the os-specific NFS exports file. # # Returns 0 if exported, 1 otherwise. # function is_exported { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case $mtpt in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case "$UNAME" in FreeBSD) is_exported_freebsd "$fs" ;; Linux) is_exported_linux "$fs" ;; *) is_exported_illumos "$fs" ;; esac } # # Given a dataset name determine if it is shared via SMB. # # Returns 0 if shared, 1 otherwise. # function is_shared_smb { typeset fs=$1 datasetexists "$fs" || return if is_linux; then net usershare list | grep -xFq "${fs//[-\/]/_}" else log_note "SMB on $UNAME currently unsupported by the test framework" return 1 fi } # # Given a mountpoint, determine if it is not shared via NFS. # # Returns 0 if not shared, 1 otherwise. # function not_shared { ! is_shared $1 } # # Given a dataset determine if it is not shared via SMB. # # Returns 0 if not shared, 1 otherwise. # function not_shared_smb { ! is_shared_smb $1 } # # Helper function to unshare a mountpoint. # function unshare_fs #fs { typeset fs=$1 if is_shared $fs || is_shared_smb $fs; then log_must zfs unshare $fs fi } # # Helper function to share a NFS mountpoint. # function share_nfs #fs { typeset fs=$1 is_shared "$fs" && return case "$UNAME" in Linux) log_must exportfs "*:$fs" ;; FreeBSD) typeset mountd read -r mountd < /var/run/mountd.pid log_must eval "printf '%s\t\n' \"$fs\" >> /etc/zfs/exports" log_must kill -s HUP "$mountd" ;; *) log_must share -F nfs "$fs" ;; esac return 0 } # # Helper function to unshare a NFS mountpoint. # function unshare_nfs #fs { typeset fs=$1 ! is_shared "$fs" && return case "$UNAME" in Linux) log_must exportfs -u "*:$fs" ;; FreeBSD) typeset mountd read -r mountd < /var/run/mountd.pid awk -v fs="${fs//\\/\\\\}" '$1 != fs' /etc/zfs/exports > /etc/zfs/exports.$$ log_must mv /etc/zfs/exports.$$ /etc/zfs/exports log_must kill -s HUP "$mountd" ;; *) log_must unshare -F nfs $fs ;; esac return 0 } # # Helper function to show NFS shares. # function showshares_nfs { case "$UNAME" in Linux) exportfs -v ;; FreeBSD) showmount ;; *) share -F nfs ;; esac } function check_nfs { case "$UNAME" in Linux) exportfs -s ;; FreeBSD) showmount -e ;; *) log_unsupported "Unknown platform" ;; esac || log_unsupported "The NFS utilities are not installed" } # # Check NFS server status and trigger it online. # function setup_nfs_server { # Cannot share directory in non-global zone. # if ! is_global_zone; then log_note "Cannot trigger NFS server by sharing in LZ." 
return fi if is_linux; then # # Re-synchronize /var/lib/nfs/etab with /etc/exports and # /etc/exports.d./* to provide a clean test environment. # log_must exportfs -r log_note "NFS server must be started prior to running ZTS." return elif is_freebsd; then log_must kill -s HUP $(/dev/null) [ $cur_zone = "global" ] fi } # # Verify whether test is permitted to run from # global zone, local zone, or both # # $1 zone limit, could be "global", "local", or "both"(no limit) # # Return 0 if permitted, otherwise exit with log_unsupported # function verify_runnable # zone limit { typeset limit=$1 [[ -z $limit ]] && return 0 if is_global_zone ; then case $limit in global|both) ;; local) log_unsupported "Test is unable to run from "\ "global zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac else case $limit in local|both) ;; global) log_unsupported "Test is unable to run from "\ "local zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac reexport_pool fi return 0 } # Return 0 if create successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # $2-n - [keyword] devs_list function create_pool #pool devs_list { typeset pool=${1%%/*} shift if [[ -z $pool ]]; then log_note "Missing pool name." return 1 fi if poolexists $pool ; then destroy_pool $pool fi if is_global_zone ; then [[ -d /$pool ]] && rm -rf /$pool log_must zpool create -f $pool $@ fi return 0 } # Return 0 if destroy successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # Destroy pool with the given parameters. function destroy_pool #pool { typeset pool=${1%%/*} typeset mtpt if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi if is_global_zone ; then if poolexists "$pool" ; then mtpt=$(get_prop mountpoint "$pool") # At times, syseventd/udev activity can cause attempts # to destroy a pool to fail with EBUSY. We retry a few # times allowing failures before requiring the destroy # to succeed. log_must_busy zpool destroy -f $pool [[ -d $mtpt ]] && \ log_must rm -rf $mtpt else log_note "Pool does not exist. ($pool)" return 1 fi fi return 0 } # Return 0 if created successfully; $? otherwise # # $1 - dataset name # $2-n - dataset options function create_dataset #dataset dataset_options { typeset dataset=$1 shift if [[ -z $dataset ]]; then log_note "Missing dataset name." return 1 fi if datasetexists $dataset ; then destroy_dataset $dataset fi log_must zfs create $@ $dataset return 0 } # Return 0 if destroy successfully or the dataset exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - dataset name # $2 - custom arguments for zfs destroy # Destroy dataset with the given parameters. function destroy_dataset # dataset [args] { typeset dataset=$1 typeset mtpt typeset args=${2:-""} if [[ -z $dataset ]]; then log_note "No dataset name given." return 1 fi if is_global_zone ; then if datasetexists "$dataset" ; then mtpt=$(get_prop mountpoint "$dataset") log_must_busy zfs destroy $args $dataset [ -d $mtpt ] && log_must rm -rf $mtpt else log_note "Dataset does not exist. ($dataset)" return 1 fi fi return 0 } # # Reexport TESTPOOL & TESTPOOL(1-4) # function reexport_pool { typeset -i cntctr=5 typeset -i i=0 while ((i < cntctr)); do if ((i == 0)); then TESTPOOL=$ZONE_POOL/$ZONE_CTR$i if ! ismounted $TESTPOOL; then log_must zfs mount $TESTPOOL fi else eval TESTPOOL$i=$ZONE_POOL/$ZONE_CTR$i if eval ! 
ismounted \$TESTPOOL$i; then log_must eval zfs mount \$TESTPOOL$i fi fi ((i += 1)) done } # # Verify a given disk or pool state # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_state # pool disk state{online,offline,degraded} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 [[ -z $pool ]] || [[ -z $state ]] \ && log_fail "Arguments invalid or missing" if [[ -z $disk ]]; then #check pool state only zpool get -H -o value health $pool | grep -qi "$state" else zpool status -v $pool | grep "$disk" | grep -qi "$state" fi } # # Get the mountpoint of snapshot # For the snapshot use /.zfs/snapshot/ # as its mountpoint # function snapshot_mountpoint { typeset dataset=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if [[ $dataset != *@* ]]; then log_fail "Error name of snapshot '$dataset'." fi typeset fs=${dataset%@*} typeset snap=${dataset#*@} if [[ -z $fs || -z $snap ]]; then log_fail "Error name of snapshot '$dataset'." fi echo $(get_prop mountpoint $fs)/.zfs/snapshot/$snap } # # Given a device and 'ashift' value verify it's correctly set on every label # function verify_ashift # device ashift { typeset device="$1" typeset ashift="$2" zdb -e -lll $device | awk -v ashift=$ashift ' /ashift: / { if (ashift != $2) exit 1; else count++; } END { exit (count != 4); }' } # # Given a pool and file system, this function will verify the file system # using the zdb internal tool. Note that the pool is exported and imported # to ensure it has consistent state. # function verify_filesys # pool filesystem dir { typeset pool="$1" typeset filesys="$2" typeset zdbout="/tmp/zdbout.$$" shift shift typeset dirs=$@ typeset search_path="" log_note "Calling zdb to verify filesystem '$filesys'" zfs unmount -a > /dev/null 2>&1 log_must zpool export $pool if [[ -n $dirs ]] ; then for dir in $dirs ; do search_path="$search_path -d $dir" done fi log_must zpool import $search_path $pool if ! zdb -cudi $filesys > $zdbout 2>&1; then log_note "Output: zdb -cudi $filesys" cat $zdbout rm -f $zdbout log_fail "zdb detected errors with: '$filesys'" fi log_must zfs mount -a log_must rm -rf $zdbout } # # Given a pool issue a scrub and verify that no checksum errors are reported. # function verify_pool { typeset pool=${1:-$TESTPOOL} log_must zpool scrub $pool log_must wait_scrubbed $pool typeset -i cksum=$(zpool status $pool | awk ' !NF { isvdev = 0 } isvdev { errors += $NF } /CKSUM$/ { isvdev = 1 } END { print errors } ') if [[ $cksum != 0 ]]; then log_must zpool status -v log_fail "Unexpected CKSUM errors found on $pool ($cksum)" fi } # # Given a pool, and this function list all disks in the pool # function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") } # # Given a pool, and this function list all disks in the pool with their full # path (like "/dev/sda" instead of "sda"). # function get_disklist_fullpath # pool { get_disklist "-P $1" } # /** # This function kills a given list of processes after a time period. We use # this in the stress tests instead of STF_TIMEOUT so that we can have processes # run for a fixed amount of time, yet still pass. Tests that hit STF_TIMEOUT # would be listed as FAIL, which we don't want : we're happy with stress tests # running for a certain amount of time, then finishing. # # @param $1 the time in seconds after which we should terminate these processes # @param $2..$n the processes we wish to terminate. 
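#
# E.g. (illustrative; the pids would come from backgrounded workers):
#   stress_timeout 300 $pid1 $pid2
# sleeps 300 seconds and then sends SIGUSR1 to any of the listed
# processes that are still running.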
# */ function stress_timeout { typeset -i TIMEOUT=$1 shift typeset cpids="$@" log_note "Waiting for child processes($cpids). " \ "It could last dozens of minutes, please be patient ..." log_must sleep $TIMEOUT log_note "Killing child processes after ${TIMEOUT} stress timeout." typeset pid for pid in $cpids; do ps -p $pid > /dev/null 2>&1 && log_must kill -USR1 $pid done } # # Verify a given hotspare disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_hotspare_state # pool disk state{inuse,avail} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk "spares") [ $state = $cur_state ] } # # Wait until a hotspare transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_hotspare_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_hotspare_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Verify a given vdev disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_vdev_state # pool disk state{online,offline,unavail,removed} { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk) [ $state = $cur_state ] } # # Wait until a vdev transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_vdev_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_vdev_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Check the output of 'zpool status -v ', # and to see if the content of contain the specified. # # Return 0 is contain, 1 otherwise # function check_pool_status # pool token keyword { typeset pool=$1 typeset token=$2 typeset keyword=$3 typeset verbose=${4:-false} scan=$(zpool status -v "$pool" 2>/dev/null | awk -v token="$token:" '$1==token') if [[ $verbose == true ]]; then log_note $scan fi echo $scan | grep -qi "$keyword" } # # The following functions are instance of check_pool_status() # is_pool_resilvering - to check if the pool resilver is in progress # is_pool_resilvered - to check if the pool resilver is completed # is_pool_scrubbing - to check if the pool scrub is in progress # is_pool_scrubbed - to check if the pool scrub is completed # is_pool_scrub_stopped - to check if the pool scrub is stopped # is_pool_scrub_paused - to check if the pool scrub has paused # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded # is_pool_replacing - to check if the pool is performing a replacement # function is_pool_resilvering #pool { check_pool_status "$1" "scan" \ "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 } function is_pool_resilvered #pool { check_pool_status "$1" "scan" "resilvered " $2 } function is_pool_scrubbing #pool { check_pool_status "$1" "scan" "scrub in progress since " $2 } function is_pool_error_scrubbing #pool { check_pool_status "$1" "scrub" "error scrub in progress since " $2 return $? 
} function is_pool_scrubbed #pool { check_pool_status "$1" "scan" "scrub repaired" $2 } function is_pool_scrub_stopped #pool { check_pool_status "$1" "scan" "scrub canceled" $2 } function is_pool_error_scrub_stopped #pool { check_pool_status "$1" "scrub" "error scrub canceled on " $2 return $? } function is_pool_scrub_paused #pool { check_pool_status "$1" "scan" "scrub paused since " $2 } function is_pool_error_scrub_paused #pool { check_pool_status "$1" "scrub" "error scrub paused since " $2 return $? } function is_pool_removing #pool { check_pool_status "$1" "remove" "in progress since " } function is_pool_removed #pool { check_pool_status "$1" "remove" "completed on" } function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" } function is_pool_replacing #pool { zpool status "$1" | grep -qE 'replacing-[0-9]+' } function wait_for_degraded { typeset pool=$1 typeset timeout=${2:-30} typeset t0=$SECONDS while :; do [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break log_note "$pool is not yet degraded." sleep 1 if ((SECONDS - t0 > $timeout)); then log_note "$pool not degraded after $timeout seconds." return 1 fi done return 0 } # # Use create_pool()/destroy_pool() to clean up the information in # in the given disk to avoid slice overlapping. # function cleanup_devices #vdevs { typeset pool="foopool$$" for vdev in $@; do zero_partitions $vdev done poolexists $pool && destroy_pool $pool create_pool $pool $@ destroy_pool $pool return 0 } #/** # A function to find and locate free disks on a system or from given # disks as the parameter. It works by locating disks that are in use # as swap devices and dump devices, and also disks listed in /etc/vfstab # # $@ given disks to find which are free, default is all disks in # the test system # # @return a string containing the list of available disks #*/ function find_disks { # Trust provided list, no attempt is made to locate unused devices. if is_linux || is_freebsd; then echo "$@" return fi sfi=/tmp/swaplist.$$ dmpi=/tmp/dumpdev.$$ max_finddisksnum=${MAX_FINDDISKSNUM:-6} swap -l > $sfi dumpadm > $dmpi 2>/dev/null disks=${@:-$(echo "" | format -e 2>/dev/null | awk ' BEGIN { FS="."; } /^Specify disk/{ searchdisks=0; } { if (searchdisks && $2 !~ "^$"){ split($2,arr," "); print arr[1]; } } /^AVAILABLE DISK SELECTIONS:/{ searchdisks=1; } ')} unused="" for disk in $disks; do # Check for mounted grep -q "${disk}[sp]" /etc/mnttab && continue # Check for swap grep -q "${disk}[sp]" $sfi && continue # check for dump device grep -q "${disk}[sp]" $dmpi && continue # check to see if this disk hasn't been explicitly excluded # by a user-set environment variable echo "${ZFS_HOST_DEVICES_IGNORE}" | grep -q "${disk}" && continue unused_candidates="$unused_candidates $disk" done rm $sfi $dmpi # now just check to see if those disks do actually exist # by looking for a device pointing to the first slice in # each case. limit the number to max_finddisksnum count=0 for disk in $unused_candidates; do if is_disk_device $DEV_DSKDIR/${disk}s0 && \ [ $count -lt $max_finddisksnum ]; then unused="$unused $disk" # do not impose limit if $@ is provided [[ -z $@ ]] && ((count = count + 1)) fi done # finally, return our disk list echo $unused } function add_user_freebsd # { typeset group=$1 typeset user=$2 typeset basedir=$3 # Check to see if the user exists. if id $user > /dev/null 2>&1; then return 0 fi # Assign 1000 as the base uid typeset -i uid=1000 while true; do pw useradd -u $uid -g $group -d $basedir/$user -m -n $user case $? 
in 0) break ;; # The uid is not unique 65) ((uid += 1)) ;; *) return 1 ;; esac if [[ $uid == 65000 ]]; then log_fail "No user id available under 65000 for $user" fi done # Silence MOTD touch $basedir/$user/.hushlogin return 0 } # # Delete the specified user. # # $1 login name # function del_user_freebsd # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must pw userdel $user fi return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group_freebsd # { typeset group=$1 # See if the group already exists. if pw groupshow $group >/dev/null 2>&1; then return 0 fi # Assign 1000 as the base gid typeset -i gid=1000 while true; do pw groupadd -g $gid -n $group > /dev/null 2>&1 case $? in 0) return 0 ;; # The gid is not unique 65) ((gid += 1)) ;; *) return 1 ;; esac if [[ $gid == 65000 ]]; then log_fail "No user id available under 65000 for $group" fi done } # # Delete the specified group. # # $1 group name # function del_group_freebsd # { typeset group=$1 pw groupdel -n $group > /dev/null 2>&1 case $? in # Group does not exist, or was deleted successfully. 0|6|65) return 0 ;; # Name already exists as a group name 9) log_must pw groupdel $group ;; *) return 1 ;; esac return 0 } function add_user_illumos # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user return 0 } function del_user_illumos # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi return 0 } function add_group_illumos # { typeset group=$1 typeset -i gid=100 while true; do groupadd -g $gid $group > /dev/null 2>&1 case $? in 0) return 0 ;; # The gid is not unique 4) ((gid += 1)) ;; *) return 1 ;; esac done } function del_group_illumos # { typeset group=$1 groupmod -n $grp $grp > /dev/null 2>&1 case $? in # Group does not exist. 6) return 0 ;; # Name already exists as a group name 9) log_must groupdel $grp ;; *) return 1 ;; esac } function add_user_linux # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user # Add new users to the same group and the command line utils. # This allows them to be run out of the original users home # directory as long as it permissioned to be group readable. cmd_group=$(stat --format="%G" $(command -v zfs)) log_must usermod -a -G $cmd_group $user return 0 } function del_user_linux # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi } function add_group_linux # { typeset group=$1 # Assign 100 as the base gid, a larger value is selected for # Linux because for many distributions 1000 and under are reserved. while true; do groupadd $group > /dev/null 2>&1 case $? in 0) return 0 ;; *) return 1 ;; esac done } function del_group_linux # { typeset group=$1 getent group $group > /dev/null 2>&1 case $? in # Group does not exist. 2) return 0 ;; # Name already exists as a group name 0) log_must groupdel $group ;; *) return 1 ;; esac return 0 } # # Add specified user to specified group # # $1 group name # $2 user name # $3 base of the homedir (optional) # function add_user # { typeset group=$1 typeset user=$2 typeset basedir=${3:-"/var/tmp"} if ((${#group} == 0 || ${#user} == 0)); then log_fail "group name or user name are not defined." fi case "$UNAME" in FreeBSD) add_user_freebsd "$group" "$user" "$basedir" ;; Linux) add_user_linux "$group" "$user" "$basedir" ;; *) add_user_illumos "$group" "$user" "$basedir" ;; esac return 0 } # # Delete the specified user. 
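#
# A minimal, hypothetical sketch (not called by any test) showing how the
# platform-specific wrappers above are normally consumed through the
# generic add_user()/add_group() entry points; the group and user names
# below are made up for illustration only.
#
function example_user_group_roundtrip_sketch
{
	typeset group="tstgrp"
	typeset user="tstusr"

	log_must add_group $group
	log_must add_user $group $user

	# ... exercise the test as $user, e.g. via user_run ...

	log_must del_user $user
	log_must del_group $group
}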
# # $1 login name # $2 base of the homedir (optional) # function del_user # { typeset user=$1 typeset basedir=${2:-"/var/tmp"} if ((${#user} == 0)); then log_fail "login name is necessary." fi case "$UNAME" in FreeBSD) del_user_freebsd "$user" ;; Linux) del_user_linux "$user" ;; *) del_user_illumos "$user" ;; esac [[ -d $basedir/$user ]] && rm -fr $basedir/$user return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case "$UNAME" in FreeBSD) add_group_freebsd "$group" ;; Linux) add_group_linux "$group" ;; *) add_group_illumos "$group" ;; esac return 0 } # # Delete the specified group. # # $1 group name # function del_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case "$UNAME" in FreeBSD) del_group_freebsd "$group" ;; Linux) del_group_linux "$group" ;; *) del_group_illumos "$group" ;; esac return 0 } # # This function will return true if it's safe to destroy the pool passed # as argument 1. It checks for pools based on zvols and files, and also # files contained in a pool that may have a different mountpoint. # function safe_to_destroy_pool { # $1 the pool name typeset pool="" typeset DONT_DESTROY="" # We check that by deleting the $1 pool, we're not # going to pull the rug out from other pools. Do this # by looking at all other pools, ensuring that they # aren't built from files or zvols contained in this pool. for pool in $(zpool list -H -o name) do ALTMOUNTPOOL="" # this is a list of the top-level directories in each of the # files that make up the path to the files the pool is based on FILEPOOL=$(zpool status -v $pool | awk -v pool="/$1/" '$0 ~ pool {print $1}') # this is a list of the zvols that make up the pool ZVOLPOOL=$(zpool status -v $pool | awk -v zvols="$ZVOL_DEVDIR/$1$" '$0 ~ zvols {print $1}') # also want to determine if it's a file-based pool using an # alternate mountpoint... POOL_FILE_DIRS=$(zpool status -v $pool | \ awk '/\// {print $1}' | \ awk -F/ '!/dev/ {print $2}') for pooldir in $POOL_FILE_DIRS do OUTPUT=$(zfs list -H -r -o mountpoint $1 | \ awk -v pd="${pooldir}$" '$0 ~ pd {print $1}') ALTMOUNTPOOL="${ALTMOUNTPOOL}${OUTPUT}" done if [ ! -z "$ZVOLPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ZVOLPOOL on $1" fi if [ ! -z "$FILEPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $FILEPOOL on $1" fi if [ ! -z "$ALTMOUNTPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ALTMOUNTPOOL on $1" fi done if [ -z "${DONT_DESTROY}" ] then return 0 else log_note "Warning: it is not safe to destroy $1!" return 1 fi } # # Verify zfs operation with -p option work as expected # $1 operation, value could be create, clone or rename # $2 dataset type, value could be fs or vol # $3 dataset name # $4 new dataset name # function verify_opt_p_ops { typeset ops=$1 typeset datatype=$2 typeset dataset=$3 typeset newdataset=$4 if [[ $datatype != "fs" && $datatype != "vol" ]]; then log_fail "$datatype is not supported." fi # check parameters accordingly case $ops in create) newdataset=$dataset dataset="" if [[ $datatype == "vol" ]]; then ops="create -V $VOLSIZE" fi ;; clone) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset log_must snapexists $dataset ;; rename) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." 
fi log_must datasetexists $dataset ;; *) log_fail "$ops is not supported." ;; esac # make sure the upper level filesystem does not exist destroy_dataset "${newdataset%/*}" "-rRf" # without -p option, operation will fail log_mustnot zfs $ops $dataset $newdataset log_mustnot datasetexists $newdataset ${newdataset%/*} # with -p option, operation should succeed log_must zfs $ops -p $dataset $newdataset block_device_wait if ! datasetexists $newdataset ; then log_fail "-p option does not work for $ops" fi # when $ops is create or clone, redo the operation still return zero if [[ $ops != "rename" ]]; then log_must zfs $ops -p $dataset $newdataset fi return 0 } # # Get configuration of pool # $1 pool name # $2 config name # function get_config { typeset pool=$1 typeset config=$2 if ! poolexists "$pool" ; then return 1 fi if [ "$(get_pool_prop cachefile "$pool")" = "none" ]; then zdb -e $pool else zdb -C $pool fi | awk -F: -v cfg="$config:" '$0 ~ cfg {sub(/^'\''/, $2); sub(/'\''$/, $2); print $2}' } # # Privated function. Random select one of items from arguments. # # $1 count # $2-n string # function _random_get { typeset cnt=$1 shift typeset str="$@" typeset -i ind ((ind = RANDOM % cnt + 1)) echo "$str" | cut -f $ind -d ' ' } # # Random select one of item from arguments which include NONE string # function random_get_with_non { typeset -i cnt=$# ((cnt =+ 1)) _random_get "$cnt" "$@" } # # Random select one of item from arguments which doesn't include NONE string # function random_get { _random_get "$#" "$@" } # # The function will generate a dataset name with specific length # $1, the length of the name # $2, the base string to construct the name # function gen_dataset_name { typeset -i len=$1 typeset basestr="$2" typeset -i baselen=${#basestr} typeset -i iter=0 typeset l_name="" if ((len % baselen == 0)); then ((iter = len / baselen)) else ((iter = len / baselen + 1)) fi while ((iter > 0)); do l_name="${l_name}$basestr" ((iter -= 1)) done echo $l_name } # # Get cksum tuple of dataset # $1 dataset name # # sample zdb output: # Dataset data/test [ZPL], ID 355, cr_txg 2413856, 31.0K, 7 objects, rootbp # DVA[0]=<0:803046400:200> DVA[1]=<0:81199000:200> [L0 DMU objset] fletcher4 # lzjb LE contiguous unique double size=800L/200P birth=2413856L/2413856P # fill=7 cksum=11ce125712:643a9c18ee2:125e25238fca0:254a3f74b59744 function datasetcksum { typeset cksum sync sync_all_pools zdb -vvv $1 | awk -F= -v ds="^Dataset $1 "'\\[' '$0 ~ ds && /cksum/ {print $7}' } # # Get the given disk/slice state from the specific field of the pool # function get_device_state #pool disk field("", "spares","logs") { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset field=${3:-$pool} zpool status -v "$pool" 2>/dev/null | \ awk -v device=$disk -v pool=$pool -v field=$field \ 'BEGIN {startconfig=0; startfield=0; } /config:/ {startconfig=1} (startconfig==1) && ($1==field) {startfield=1; next;} (startfield==1) && ($1==device) {print $2; exit;} (startfield==1) && ($1==field || $1 ~ "^spares$" || $1 ~ "^logs$") {startfield=0}' } # # get the root filesystem name if it's zfsroot system. # # return: root filesystem name function get_rootfs { typeset rootfs="" if is_freebsd; then rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') elif ! is_linux; then rootfs=$(awk '$2 == "/" && $3 == "zfs" {print $1}' \ /etc/mnttab) fi if [[ -z "$rootfs" ]]; then log_fail "Can not get rootfs" fi if datasetexists $rootfs; then echo $rootfs else log_fail "This is not a zfsroot system." 
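#
# Worked example (comments only) for gen_dataset_name() above: the base
# string is repeated until the result is at least the requested length,
# so the generated name may be rounded up to a multiple of the base, e.g.
#
#	gen_dataset_name 16 ab	->  abababababababab  (exactly 16 chars)
#	gen_dataset_name 5 abc	->  abcabc            (rounded up to 6 chars)
#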
fi } # # get the rootfs's pool name # return: # rootpool name # function get_rootpool { typeset rootfs=$(get_rootfs) echo ${rootfs%%/*} } # # To verify if the require numbers of disks is given # function verify_disk_count { typeset -i min=${2:-1} typeset -i count=$(echo "$1" | wc -w) if ((count < min)); then log_untested "A minimum of $min disks is required to run." \ " You specified $count disk(s)" fi } function ds_is_volume { typeset type=$(get_prop type $1) [ $type = "volume" ] } function ds_is_filesystem { typeset type=$(get_prop type $1) [ $type = "filesystem" ] } # # Check if Trusted Extensions are installed and enabled # function is_te_enabled { svcs -H -o state labeld 2>/dev/null | grep -q "enabled" } # Return the number of CPUs (cross-platform) function get_num_cpus { if is_linux ; then grep -c '^processor' /proc/cpuinfo elif is_freebsd; then sysctl -n kern.smp.cpus else psrinfo | wc -l fi } # Utility function to determine if a system has multiple cpus. function is_mp { [[ $(get_num_cpus) -gt 1 ]] } function get_cpu_freq { if is_linux; then lscpu | awk '/CPU MHz/ { print $3 }' elif is_freebsd; then sysctl -n hw.clockrate else psrinfo -v 0 | awk '/processor operates at/ {print $6}' fi } # Run the given command as the user provided. function user_run { typeset user=$1 shift log_note "user: $user" log_note "cmd: $*" typeset out=$TEST_BASE_DIR/out typeset err=$TEST_BASE_DIR/err sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err typeset res=$? log_note "out: $(<$out)" log_note "err: $(<$err)" return $res } # # Check if the pool contains the specified vdevs # # $1 pool # $2..n ... # # Return 0 if the vdevs are contained in the pool, 1 if any of the specified # vdevs is not in the pool, and 2 if pool name is missing. # function vdevs_in_pool { typeset pool=$1 typeset vdev if [[ -z $pool ]]; then log_note "Missing pool name." return 2 fi shift # We could use 'zpool list' to only get the vdevs of the pool but we # can't reference a mirror/raidz vdev using its ID (i.e mirror-0), # therefore we use the 'zpool status' output. typeset tmpfile=$(mktemp) zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile for vdev in "$@"; do grep -wq ${vdev##*/} $tmpfile || return 1 done rm -f $tmpfile return 0 } function get_max { typeset -l i max=$1 shift for i in "$@"; do max=$((max > i ? max : i)) done echo $max } # Write data that can be compressed into a directory function write_compressible { typeset dir=$1 typeset megs=$2 typeset nfiles=${3:-1} typeset bs=${4:-1024k} typeset fname=${5:-file} [[ -d $dir ]] || log_fail "No directory: $dir" # Under Linux fio is not currently used since its behavior can # differ significantly across versions. This includes missing # command line options and cases where the --buffer_compress_* # options fail to behave as expected. if is_linux; then typeset file_bytes=$(to_bytes $megs) typeset bs_bytes=4096 typeset blocks=$(($file_bytes / $bs_bytes)) for (( i = 0; i < $nfiles; i++ )); do truncate -s $file_bytes $dir/$fname.$i # Write every third block to get 66% compression. 
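			# With bs_bytes=4096 this touches blocks 0, 3, 6, ...,
			# so roughly one third of each file holds incompressible
			# urandom data while the other two thirds remain the
			# zero-filled holes left by truncate; that untouched
			# portion is what yields the ~66% compressible content.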
for (( j = 0; j < $blocks; j += 3 )); do dd if=/dev/urandom of=$dir/$fname.$i \ seek=$j bs=$bs_bytes count=1 \ conv=notrunc >/dev/null 2>&1 done done else command -v fio > /dev/null || log_unsupported "fio missing" log_must eval fio \ --name=job \ --fallocate=0 \ --minimal \ --randrepeat=0 \ --buffer_compress_percentage=66 \ --buffer_compress_chunk=4096 \ --directory="$dir" \ --numjobs="$nfiles" \ --nrfiles="$nfiles" \ --rw=write \ --bs="$bs" \ --filesize="$megs" \ "--filename_format='$fname.\$jobnum' >/dev/null" fi } function get_objnum { typeset pathname=$1 typeset objnum [[ -e $pathname ]] || log_fail "No such file or directory: $pathname" if is_freebsd; then objnum=$(stat -f "%i" $pathname) else objnum=$(stat -c %i $pathname) fi echo $objnum } # # Sync data to the pool # # $1 pool name # $2 boolean to force uberblock (and config including zpool cache file) update # function sync_pool #pool { typeset pool=${1:-$TESTPOOL} typeset force=${2:-false} if [[ $force == true ]]; then log_must zpool sync -f $pool else log_must zpool sync $pool fi return 0 } # # Sync all pools # # $1 boolean to force uberblock (and config including zpool cache file) update # function sync_all_pools # { typeset force=${1:-false} if [[ $force == true ]]; then log_must zpool sync -f else log_must zpool sync fi return 0 } # # Wait for zpool 'freeing' property drops to zero. # # $1 pool name # function wait_freeing #pool { typeset pool=${1:-$TESTPOOL} while true; do [[ "0" == "$(zpool list -Ho freeing $pool)" ]] && break log_must sleep 1 done } # # Wait for every device replace operation to complete # # $1 pool name # $2 timeout # function wait_replacing #pool timeout { typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} for (( timer = 0; timer < $timeout; timer++ )); do is_pool_replacing $pool || break; sleep 1; done } # Wait for a pool to be scrubbed # # $1 pool name # $2 timeout # function wait_scrubbed #pool timeout { typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} for (( timer = 0; timer < $timeout; timer++ )); do is_pool_scrubbed $pool && break; sleep 1; done } # Backup the zed.rc in our test directory so that we can edit it for our test. # # Returns: Backup file name. You will need to pass this to zed_rc_restore(). function zed_rc_backup { zedrc_backup="$(mktemp)" cp $ZEDLET_DIR/zed.rc $zedrc_backup echo $zedrc_backup } function zed_rc_restore { mv $1 $ZEDLET_DIR/zed.rc } # # Setup custom environment for the ZED. # # $@ Optional list of zedlets to run under zed. function zed_setup { if ! is_linux; then log_unsupported "No zed on $UNAME" fi if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi if [[ ! -e $VDEVID_CONF ]]; then log_must touch $VDEVID_CONF fi if [[ -e $VDEVID_CONF_ETC ]]; then log_fail "Must not have $VDEVID_CONF_ETC file present on system" fi EXTRA_ZEDLETS=$@ # Create a symlink for /etc/zfs/vdev_id.conf file. log_must ln -s $VDEVID_CONF $VDEVID_CONF_ETC # Setup minimal ZED configuration. Individual test cases should # add additional ZEDLETs as needed for their specific test. log_must cp ${ZEDLET_ETC_DIR}/zed.rc $ZEDLET_DIR log_must cp ${ZEDLET_ETC_DIR}/zed-functions.sh $ZEDLET_DIR # Scripts must only be user writable. if [[ -n "$EXTRA_ZEDLETS" ]] ; then saved_umask=$(umask) log_must umask 0022 for i in $EXTRA_ZEDLETS ; do log_must cp ${ZEDLET_LIBEXEC_DIR}/$i $ZEDLET_DIR done log_must umask $saved_umask fi # Customize the zed.rc file to enable the full debug log. 
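	# The sed below strips the commented-out "#ZED_DEBUG_LOG=..." line
	# shipped in the stock zed.rc, and the echo then appends an explicit
	# ZED_DEBUG_LOG=$ZED_DEBUG_LOG assignment so the daemon started by
	# zed_start() writes its verbose log where the test suite expects it.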
log_must sed -i '/\#ZED_DEBUG_LOG=.*/d' $ZEDLET_DIR/zed.rc echo "ZED_DEBUG_LOG=$ZED_DEBUG_LOG" >>$ZEDLET_DIR/zed.rc } # # Cleanup custom ZED environment. # # $@ Optional list of zedlets to remove from our test zed.d directory. function zed_cleanup { if ! is_linux; then return fi for extra_zedlet; do log_must rm -f ${ZEDLET_DIR}/$extra_zedlet done log_must rm -fd ${ZEDLET_DIR}/zed.rc ${ZEDLET_DIR}/zed-functions.sh ${ZEDLET_DIR}/all-syslog.sh ${ZEDLET_DIR}/all-debug.sh ${ZEDLET_DIR}/state \ $ZED_LOG $ZED_DEBUG_LOG $VDEVID_CONF_ETC $VDEVID_CONF \ $ZEDLET_DIR } # # Check if ZED is currently running; if so, returns PIDs # function zed_check { if ! is_linux; then return fi zedpids="$(pgrep -x zed)" zedpids2="$(pgrep -x lt-zed)" echo ${zedpids} ${zedpids2} } # # Check if ZED is currently running, if not start ZED. # function zed_start { if ! is_linux; then return fi # ZEDLET_DIR=/var/tmp/zed if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi # Verify the ZED is not already running. zedpids=$(zed_check) if [ -n "$zedpids" ]; then # We never, ever, really want it to just keep going if zed # is already running - usually this implies our test cases # will break very strangely because whatever we wanted to # configure zed for won't be listening to our changes in the # tmpdir log_fail "ZED already running - ${zedpids}" else log_note "Starting ZED" # run ZED in the background and redirect foreground logging # output to $ZED_LOG. log_must truncate -s 0 $ZED_DEBUG_LOG log_must eval "zed -vF -d $ZEDLET_DIR -P $PATH" \ "-s $ZEDLET_DIR/state -j 1 2>$ZED_LOG &" fi return 0 } # # Kill ZED process # function zed_stop { if ! is_linux; then return "" fi log_note "Stopping ZED" while true; do zedpids=$(zed_check) [ ! -n "$zedpids" ] && break log_must kill $zedpids sleep 1 done return 0 } # # Drain all zevents # function zed_events_drain { while [ $(zpool events -H | wc -l) -ne 0 ]; do sleep 1 zpool events -c >/dev/null done } # Set a variable in zed.rc to something, un-commenting it in the process. # # $1 variable # $2 value function zed_rc_set { var="$1" val="$2" # Remove the line cmd="'/$var/d'" eval sed -i $cmd $ZEDLET_DIR/zed.rc # Add it at the end echo "$var=$val" >> $ZEDLET_DIR/zed.rc } # # Check is provided device is being active used as a swap device. # function is_swap_inuse { typeset device=$1 if [[ -z $device ]] ; then log_note "No device specified." return 1 fi case "$UNAME" in Linux) swapon -s | grep -wq $(readlink -f $device) ;; FreeBSD) swapctl -l | grep -wq $device ;; *) swap -l | grep -wq $device ;; esac } # # Setup a swap device using the provided device. # function swap_setup { typeset swapdev=$1 case "$UNAME" in Linux) log_must eval "mkswap $swapdev > /dev/null 2>&1" log_must swapon $swapdev ;; FreeBSD) log_must swapctl -a $swapdev ;; *) log_must swap -a $swapdev ;; esac return 0 } # # Cleanup a swap device on the provided device. 
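#
# A hypothetical sketch (never invoked by the suite) of the usual ZED
# lifecycle built from the helpers above; the zedlet name and the zed.rc
# variable used here are examples only.
#
function example_zed_lifecycle_sketch
{
	zed_events_drain
	zed_setup all-debug.sh
	typeset zedrc_backup=$(zed_rc_backup)
	zed_rc_set ZED_NOTIFY_VERBOSE 1
	zed_start

	# ... generate and consume zevents here ...

	zed_stop
	zed_rc_restore $zedrc_backup
	zed_cleanup all-debug.sh
}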
# function swap_cleanup { typeset swapdev=$1 if is_swap_inuse $swapdev; then if is_linux; then log_must swapoff $swapdev elif is_freebsd; then log_must swapoff $swapdev else log_must swap -d $swapdev fi fi return 0 } # # Set a global system tunable (64-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable64 { set_tunable_impl "$1" "$2" Z } # # Set a global system tunable (32-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable32 { set_tunable_impl "$1" "$2" W } function set_tunable_impl { typeset name="$1" typeset value="$2" typeset mdb_cmd="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) log_unsupported "Tunable '$name' is unsupported on $UNAME" ;; "") log_fail "Tunable '$name' must be added to tunables.cfg" ;; *) ;; esac [[ -z "$value" ]] && return 1 [[ -z "$mdb_cmd" ]] && return 1 case "$UNAME" in Linux) typeset zfs_tunables="/sys/module/zfs/parameters" echo "$value" >"$zfs_tunables/$tunable" ;; FreeBSD) sysctl vfs.zfs.$tunable=$value ;; SunOS) echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw ;; esac } function save_tunable { [[ ! -d $TEST_BASE_DIR ]] && return 1 [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" } function restore_tunable { [[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1 val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" set_tunable64 "$1" "$val" rm $TEST_BASE_DIR/tunable-$1 } # # Get a global system tunable # # $1 tunable name (use a NAME defined in tunables.cfg) # function get_tunable { get_tunable_impl "$1" } function get_tunable_impl { typeset name="$1" typeset module="${2:-zfs}" typeset check_only="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) if [ -z "$check_only" ] ; then log_unsupported "Tunable '$name' is unsupported on $UNAME" else return 1 fi ;; "") if [ -z "$check_only" ] ; then log_fail "Tunable '$name' must be added to tunables.cfg" else return 1 fi ;; *) ;; esac case "$UNAME" in Linux) typeset zfs_tunables="/sys/module/$module/parameters" cat $zfs_tunables/$tunable ;; FreeBSD) sysctl -n vfs.zfs.$tunable ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 ;; esac } # Does a tunable exist? # # $1: Tunable name function tunable_exists { get_tunable_impl $1 "zfs" 1 } # # Compute MD5 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function md5digest { typeset file=$1 case "$UNAME" in FreeBSD) md5 -q $file ;; *) typeset sum _ read -r sum _ < <(md5sum -b $file) echo $sum ;; esac } # # Compute SHA256 digest for given file or stdin if no file given. 
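#
# A hypothetical sketch (not used by any test) of the usual pattern for the
# tunable helpers above; TXG_TIMEOUT is assumed to be one of the NAMEs
# defined in tunables.cfg.
#
function example_tunable_roundtrip_sketch
{
	typeset saved=$(get_tunable TXG_TIMEOUT)

	log_must set_tunable64 TXG_TIMEOUT 1

	# ... exercise behaviour that depends on frequent txg syncs ...

	log_must set_tunable64 TXG_TIMEOUT $saved
}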
# Note: file path must not contain spaces # function sha256digest { typeset file=$1 case "$UNAME" in FreeBSD) sha256 -q $file ;; *) typeset sum _ read -r sum _ < <(sha256sum -b $file) echo $sum ;; esac } function new_fs # { case "$UNAME" in FreeBSD) newfs "$@" ;; *) echo y | newfs -v "$@" ;; esac } function stat_size # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %z "$path" ;; *) stat -c %s "$path" ;; esac } function stat_mtime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %m "$path" ;; *) stat -c %Y "$path" ;; esac } function stat_ctime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %c "$path" ;; *) stat -c %Z "$path" ;; esac } function stat_crtime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %B "$path" ;; *) stat -c %W "$path" ;; esac } function stat_generation # { typeset path=$1 case "$UNAME" in Linux) getversion "${path}" ;; *) stat -f %v "${path}" ;; esac } # Run a command as if it was being run in a TTY. # # Usage: # # faketty command # function faketty { if is_freebsd; then script -q /dev/null env "$@" else script --return --quiet -c "$*" /dev/null fi } # # Produce a random permutation of the integers in a given range (inclusive). # function range_shuffle # begin end { typeset -i begin=$1 typeset -i end=$2 seq ${begin} ${end} | sort -R } # # Cross-platform xattr helpers # function get_xattr # name path { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) getextattr -qq user "${name}" "${path}" ;; *) attr -qg "${name}" "${path}" ;; esac } function set_xattr # name value path { typeset name=$1 typeset value=$2 typeset path=$3 case "$UNAME" in FreeBSD) setextattr user "${name}" "${value}" "${path}" ;; *) attr -qs "${name}" -V "${value}" "${path}" ;; esac } function set_xattr_stdin # name value { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) setextattr -i user "${name}" "${path}" ;; *) attr -qs "${name}" "${path}" ;; esac } function rm_xattr # name path { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) rmextattr -q user "${name}" "${path}" ;; *) attr -qr "${name}" "${path}" ;; esac } function ls_xattr # path { typeset path=$1 case "$UNAME" in FreeBSD) lsextattr -qq user "${path}" ;; *) attr -ql "${path}" ;; esac } function kstat # stat flags? { typeset stat=$1 typeset flags=${2-"-n"} case "$UNAME" in FreeBSD) sysctl $flags kstat.zfs.misc.$stat ;; Linux) cat "/proc/spl/kstat/zfs/$stat" 2>/dev/null ;; *) false ;; esac } function get_arcstat # stat { typeset stat=$1 case "$UNAME" in FreeBSD) kstat arcstats.$stat ;; Linux) kstat arcstats | awk "/$stat/"' { print $3 }' ;; *) false ;; esac } function punch_hole # offset length file { typeset offset=$1 typeset length=$2 typeset file=$3 case "$UNAME" in FreeBSD) truncate -d -o $offset -l $length "$file" ;; Linux) fallocate --punch-hole --offset $offset --length $length "$file" ;; *) false ;; esac } function zero_range # offset length file { typeset offset=$1 typeset length=$2 typeset file=$3 case "$UNAME" in Linux) fallocate --zero-range --offset $offset --length $length "$file" ;; *) false ;; esac } # # Wait for the specified arcstat to reach non-zero quiescence. # If echo is 1 echo the value after reaching quiescence, otherwise # if echo is 0 print the arcstat we are waiting on. # function arcstat_quiescence # stat echo { typeset stat=$1 typeset echo=$2 typeset do_once=true if [[ $echo -eq 0 ]]; then echo "Waiting for arcstat $1 quiescence." 
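	# A hypothetical usage sketch (comments only) for the kstat helpers
	# defined above, e.g. checking that a cached read is served from the
	# ARC; the file path is illustrative:
	#
	#	before=$(get_arcstat hits)
	#	log_must dd if=/$TESTPOOL/$TESTFS/file of=/dev/null bs=128k
	#	after=$(get_arcstat hits)
	#	((after > before)) || log_fail "expected ARC hits to increase"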
fi while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do typeset stat1=$(get_arcstat $stat) sleep 0.5 typeset stat2=$(get_arcstat $stat) do_once=false done if [[ $echo -eq 1 ]]; then echo $stat2 fi } function arcstat_quiescence_noecho # stat { typeset stat=$1 arcstat_quiescence $stat 0 } function arcstat_quiescence_echo # stat { typeset stat=$1 arcstat_quiescence $stat 1 } # # Given an array of pids, wait until all processes # have completed and check their return status. # function wait_for_children #children { rv=0 children=("$@") for child in "${children[@]}" do child_exit=0 wait ${child} || child_exit=$? if [ $child_exit -ne 0 ]; then echo "child ${child} failed with ${child_exit}" rv=1 fi done return $rv } # # Compare two directory trees recursively in a manner similar to diff(1), but # using rsync. If there are any discrepancies, a summary of the differences are # output and a non-zero error is returned. # # If you're comparing a directory after a ZIL replay, you should set # LIBTEST_DIFF_ZIL_REPLAY=1 or use replay_directory_diff which will cause # directory_diff to ignore mtime changes (the ZIL replay won't fix up mtime # information). # function directory_diff # dir_a dir_b { dir_a="$1" dir_b="$2" zil_replay="${LIBTEST_DIFF_ZIL_REPLAY:-0}" # If one of the directories doesn't exist, return 2. This is to match the # semantics of diff. if ! [ -d "$dir_a" -a -d "$dir_b" ]; then return 2 fi # Run rsync with --dry-run --itemize-changes to get something akin to diff # output, but rsync is far more thorough in detecting differences (diff # doesn't compare file metadata, and cannot handle special files). # # Also make sure to filter out non-user.* xattrs when comparing. On # SELinux-enabled systems the copied tree will probably have different # SELinux labels. args=("-nicaAHX" '--filter=-x! user.*' "--delete") # NOTE: Quite a few rsync builds do not support --crtimes which would be # necessary to verify that creation times are being maintained properly. # Unfortunately because of this we cannot use it unconditionally but we can # check if this rsync build supports it and use it then. This check is # based on the same check in the rsync test suite (testsuite/crtimes.test). # # We check ctimes even with zil_replay=1 because the ZIL does store # creation times and we should make sure they match (if the creation times # do not match there is a "c" entry in one of the columns). if rsync --version | grep -q "[, ] crtimes"; then args+=("--crtimes") else log_note "This rsync package does not support --crtimes (-N)." fi # If we are testing a ZIL replay, we need to ignore timestamp changes. # Unfortunately --no-times doesn't do what we want -- it will still tell # you if the timestamps don't match but rsync will set the timestamps to # the current time (leading to an itemised change entry). It's simpler to # just filter out those lines. if [ "$zil_replay" -eq 0 ]; then filter=("cat") else # Different rsync versions have different numbers of columns. So just # require that aside from the first two, all other columns must be # blank (literal ".") or a timestamp field ("[tT]"). filter=("grep" "-v" '^\..[.Tt]\+ ') fi diff="$(rsync "${args[@]}" "$dir_a/" "$dir_b/" | "${filter[@]}")" rv=0 if [ -n "$diff" ]; then echo "$diff" rv=1 fi return $rv } # # Compare two directory trees recursively, without checking whether the mtimes # match (creation times will be checked if the available rsync binary supports # it). 
This is necessary for ZIL replay checks (because the ZIL does not # contain mtimes and thus after a ZIL replay, mtimes won't match). # # This is shorthand for LIBTEST_DIFF_ZIL_REPLAY=1 directory_diff <...>. # function replay_directory_diff # dir_a dir_b { LIBTEST_DIFF_ZIL_REPLAY=1 directory_diff "$@" } # # Put coredumps into $1/core.{basename} # # Output must be saved and passed to pop_coredump_pattern on cleanup # function push_coredump_pattern # dir { ulimit -c unlimited case "$UNAME" in Linux) cat /proc/sys/kernel/core_pattern /proc/sys/kernel/core_uses_pid echo "$1/core.%e" >/proc/sys/kernel/core_pattern && echo 0 >/proc/sys/kernel/core_uses_pid ;; FreeBSD) sysctl -n kern.corefile sysctl kern.corefile="$1/core.%N" >/dev/null ;; *) # Nothing to output – set only for this shell coreadm -p "$1/core.%f" ;; esac } # # Put coredumps back into the default location # function pop_coredump_pattern { [ -s "$1" ] || return 0 case "$UNAME" in Linux) typeset pat pid { read -r pat; read -r pid; } < "$1" echo "$pat" >/proc/sys/kernel/core_pattern && echo "$pid" >/proc/sys/kernel/core_uses_pid ;; FreeBSD) sysctl kern.corefile="$(<"$1")" >/dev/null ;; esac } diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 3fb946f3de98..bcd23c9ea276 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,1149 +1,1149 @@ /* */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* add_disk() returns int */ /* #undef HAVE_ADD_DISK_RET */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 /* Define if you have [rt] */ #define HAVE_AIO_H 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdevname() is available */ /* #undef HAVE_BDEVNAME */ /* bdev_check_media_change() 
exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_63 */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_OLD */ /* bdev_kobj() exists */ /* #undef HAVE_BDEV_KOBJ */ /* bdev_max_discard_sectors() is available */ /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */ /* bdev_max_secure_erase_sectors() is available */ /* #undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */ /* block_device_operations->submit_bio() returns void */ /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio_alloc() takes 4 arguments */ /* #undef HAVE_BIO_ALLOC_4ARG */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_dev() is a macro */ /* #undef HAVE_BIO_SET_DEV_MACRO */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_get_by_path() exists and takes 4 args */ /* #undef HAVE_BLKDEV_GET_BY_PATH_4ARG */ /* blkdev_get_by_path() handles ERESTARTSYS */ /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */ /* blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD */ /* blkdev_issue_secure_erase() is available */ /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */ /* blkdev_put() accepts void* as arg 2 */ /* #undef HAVE_BLKDEV_PUT_HOLDER */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk_cleanup_disk() exists */ /* #undef HAVE_BLK_CLEANUP_DISK */ /* blk_mode_t is defined */ /* #undef HAVE_BLK_MODE_T */ /* block multiqueue is available */ /* #undef HAVE_BLK_MQ */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_discard() is available */ /* #undef HAVE_BLK_QUEUE_DISCARD */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_update_readahead() exists */ /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* BLK_STS_RESV_CONFLICT is defined */ /* #undef HAVE_BLK_STS_RESV_CONFLICT */ /* Define if release() in block_device_operations takes 1 arg */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG */ /* Define if revalidate_disk() in block_device_operations */ /* #undef 
HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_splice_read exists */ /* #undef HAVE_COPY_SPLICE_READ */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* cpu_has_feature() is GPL-only */ /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* dentry aliases are in d_u member */ /* #undef HAVE_DENTRY_D_U_ALIASES */ /* dequeue_signal() takes 4 arguments */ /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* disk_check_media_change() exists */ /* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ /* disk_update_readahead() exists */ /* #undef HAVE_DISK_UPDATE_READAHEAD */ /* Define to 1 if you have the header file. 
*/ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* FALLOC_FL_ZERO_RANGE is defined */ /* #undef HAVE_FALLOC_FL_ZERO_RANGE */ /* fault_in_iov_iter_readable() is available */ /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */ /* filemap_range_has_page() is available */ /* #undef HAVE_FILEMAP_RANGE_HAS_PAGE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* fops->fadvise() exists */ /* #undef HAVE_FILE_FADVISE */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* flush_dcache_page() is GPL-only */ /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* Define if compiler supports -Wformat-overflow */ /* #undef HAVE_FORMAT_OVERFLOW */ /* fsync_bdev() is declared in include/blkdev.h */ /* #undef HAVE_FSYNC_BDEV */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* yes */ /* #undef HAVE_GENERIC_FADVISE */ /* generic_fillattr requires struct mnt_idmap* */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP */ /* generic_fillattr requires struct mnt_idmap* and u32 request_mask */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK */ /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global */ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_acl() exists */ /* #undef HAVE_GET_ACL */ /* iops->get_acl() takes rcu */ /* #undef HAVE_GET_ACL_RCU */ /* has iops->get_inode_acl() */ /* #undef HAVE_GET_INODE_ACL */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */ /* #undef HAVE_IATTR_VFSID */ /* Define if you have the iconv() function and it works. 
*/ #define HAVE_ICONV 1 /* iops->getattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_GETATTR */ /* iops->setattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_SETATTR */ /* APIs for idmapped mount are present */ /* #undef HAVE_IDMAP_MNT_API */ /* Define if compiler supports -Wimplicit-fallthrough */ /* #undef HAVE_IMPLICIT_FALLTHROUGH */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_INFINITE_RECURSION */ /* inode_get_ctime() exists in linux/fs.h */ /* #undef HAVE_INODE_GET_CTIME */ /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes mnt_idmap */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */ /* inode_set_ctime_to_ts() exists in linux/fs.h */ /* #undef HAVE_INODE_SET_CTIME_TO_TS */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iops->create() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_CREATE_IDMAP */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKDIR_IDMAP */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKNOD_IDMAP */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->permission() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_PERMISSION_IDMAP */ /* iops->permission() takes struct user_namespace* */ /* #undef HAVE_IOPS_PERMISSION_USERNS */ /* iops->rename() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_RENAME_IDMAP */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->setattr() exists */ /* #undef HAVE_IOPS_SETATTR */ /* iops->symlink() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_SYMLINK_IDMAP */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter_type() is available */ /* #undef HAVE_IOV_ITER_TYPE */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. 
*/ #define HAVE_ISSETUGID 1 /* iter_iov() is available */ /* #undef HAVE_ITER_IOV */ /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* kernel has asm/fpu/internal.h */ /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_KERNEL_INFINITE_RECURSION */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* linux/blk-cgroup.h exists */ /* #undef HAVE_LINUX_BLK_CGROUP_HEADER */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* iops->mkdir() takes umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. 
*/ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* folio_wait_bit() exists */ /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */ /* part_to_dev() exists */ /* #undef HAVE_PART_TO_DEV */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_add_batch() is defined */ /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */ /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* struct reclaim_state has reclaimed */ /* #undef HAVE_RECLAIM_STATE_RECLAIMED */ /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ /* register_sysctl_table exists */ /* #undef HAVE_REGISTER_SYSCTL_TABLE */ /* iops->rename2() exists */ /* #undef HAVE_RENAME2 */ /* struct inode_operations_wrapper takes .rename2() */ /* #undef HAVE_RENAME2_OPERATIONS_WRAPPER */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. 
*/ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() accepts mnt_idmap */ /* #undef HAVE_SETATTR_PREPARE_IDMAP */ /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() exists, takes 3 args */ /* #undef HAVE_SET_ACL */ /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */ /* #undef HAVE_SET_ACL_IDMAP_DENTRY */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* iops->set_acl() takes 4 args, arg2 is struct dentry * */ /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* set_special_state() exists */ /* #undef HAVE_SET_SPECIAL_STATE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* signal_stop() exists */ /* #undef HAVE_SIGNAL_STOP */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* cs->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* standalone exists */ /* #undef HAVE_STANDALONE_LINUX_STDARG */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDIO_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ /* sync_blockdev() is declared in include/blkdev.h */ /* #undef HAVE_SYNC_BLOCKDEV */ /* struct kobj_type has default_groups */ /* #undef HAVE_SYSFS_DEFAULT_GROUPS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* i_op->tmpfile() uses old dentry signature */ /* #undef HAVE_TMPFILE_DENTRY */ /* i_op->tmpfile() has mnt_idmap */ /* #undef HAVE_TMPFILE_IDMAP */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. 
*/ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->setattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_SETATTR */ /* user_namespace->ns.inum exists */ /* #undef HAVE_USER_NS_COMMON_INUM */ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* fops->clone_file_range() is available */ /* #undef HAVE_VFS_CLONE_FILE_RANGE */ /* fops->copy_file_range() is available */ /* #undef HAVE_VFS_COPY_FILE_RANGE */ /* fops->dedupe_file_range() is available */ /* #undef HAVE_VFS_DEDUPE_FILE_RANGE */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ /* file_operations_extend takes .copy_file_range() and .clone_file_range() */ /* #undef HAVE_VFS_FILE_OPERATIONS_EXTEND */ /* generic_copy_file_range() is available */ /* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* address_space_operations->readpages exists */ /* #undef HAVE_VFS_READPAGES */ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ /* fops->remap_file_range() is available */ /* #undef HAVE_VFS_REMAP_FILE_RANGE */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* int (*writepage_t)() takes struct folio* */ /* #undef HAVE_WRITEPAGE_T_FOLIO */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants dentry and inode and flags */ /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* xattr_handler->set() takes mnt_idmap */ /* #undef HAVE_XATTR_SET_IDMAP */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if host toolchain supports XSAVE */ #define HAVE_XSAVE 1 /* Define if host toolchain supports XSAVEOPT */ #define HAVE_XSAVEOPT 1 /* Define if host toolchain supports XSAVES 
*/
#define HAVE_XSAVES 1
/* ZERO_PAGE() is GPL-only */
/* #undef HAVE_ZERO_PAGE_GPL_ONLY */
/* Define if you have [z] */
#define HAVE_ZLIB 1
/* __posix_acl_chmod() exists */
/* #undef HAVE___POSIX_ACL_CHMOD */
/* kernel exports FPU functions */
/* #undef KERNEL_EXPORTS_X86_FPU */
/* TBD: fetch(3) support */
#if 0
/* whether the chosen libfetch is to be loaded at run-time */
#define LIBFETCH_DYNAMIC 1
/* libfetch is fetch(3) */
#define LIBFETCH_IS_FETCH 1
/* libfetch is libcurl */
#define LIBFETCH_IS_LIBCURL 0
/* soname of chosen libfetch */
#define LIBFETCH_SONAME "libfetch.so.6"
#endif
/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"
/* make_request_fn() return type */
/* #undef MAKE_REQUEST_FN_RET */
/* struct shrink_control has nid */
/* #undef SHRINK_CONTROL_HAS_NID */
/* using complete_and_exit() instead */
/* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */
/* Defined for legacy compatibility. */
#define SPL_META_ALIAS ZFS_META_ALIAS
/* Defined for legacy compatibility. */
#define SPL_META_RELEASE ZFS_META_RELEASE
/* Defined for legacy compatibility. */
#define SPL_META_VERSION ZFS_META_VERSION
/* pde_data() is PDE_DATA() */
/* #undef SPL_PDE_DATA */
/* Define to 1 if all of the C90 standard headers exist (not just the ones
   required in a freestanding environment). This macro is provided for
   backward compatibility; new code need not use it. */
#define SYSTEM_FREEBSD 1
/* True if ZFS is to be compiled for a Linux system */
/* #undef SYSTEM_LINUX */
/* Version number of package */
/* #undef ZFS_DEBUG */
/* /dev/zfs minor */
/* #undef ZFS_DEVICE_MINOR */
/* enum node_stat_item contains NR_FILE_PAGES */
/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */
/* enum node_stat_item contains NR_INACTIVE_ANON */
/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */
/* enum node_stat_item contains NR_INACTIVE_FILE */
/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */
/* enum zone_stat_item contains NR_FILE_PAGES */
/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */
/* enum zone_stat_item contains NR_INACTIVE_ANON */
/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */
/* enum zone_stat_item contains NR_INACTIVE_FILE */
/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */
/* GENHD_FL_EXT_DEVT flag is not available */
/* #undef ZFS_GENHD_FL_EXT_DEVT */
/* GENHD_FL_NO_PART_SCAN flag is available */
/* #undef ZFS_GENHD_FL_NO_PART */
/* global_node_page_state() exists */
/* #undef ZFS_GLOBAL_NODE_PAGE_STATE */
/* global_zone_page_state() exists */
/* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */
/* Define to 1 if GPL-only symbols can be used */
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.99-211-FreeBSD_g03e9caaec"
+#define ZFS_META_ALIAS "zfs-2.2.99-217-FreeBSD_ga94860a6d"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
/* Define the project release date. */
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
#define ZFS_META_KVER_MAX "6.6"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "3.10"
/* Define the project license. */
#define ZFS_META_LICENSE "CDDL"
/* Define the libtool library 'age' version information. */
/* #undef ZFS_META_LT_AGE */
/* Define the libtool library 'current' version information. */
/* #undef ZFS_META_LT_CURRENT */
/* Define the libtool library 'revision' version information. */
/* #undef ZFS_META_LT_REVISION */
/* Define the project name. */
#define ZFS_META_NAME "zfs"
/* Define the project release. */
-#define ZFS_META_RELEASE "211-FreeBSD_g03e9caaec"
+#define ZFS_META_RELEASE "217-FreeBSD_ga94860a6d"
/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"
/* count is located in percpu_ref.data */
/* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index 5924053674c3..a68f618a16b9 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.99-211-g03e9caaec"
+#define ZFS_META_GITREV "zfs-2.2.99-217-ga94860a6d"