diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index 6de09d1b7333..63bfcb39d4d8 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -1,109 +1,108 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp .PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs/os/freebsd .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare .PATH: ${SRCTOP}/sys/contrib/openzfs/include .PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd .PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd/lib PACKAGE= runtime LIB= zfs LIBADD= \ avl \ bsdxml \ crypto \ geom \ m \ md \ nvpair \ pthread \ umem \ util \ uutil \ z \ zfs_core \ zutil INCS= libzfs.h USER_C = \ libzfs_changelist.c \ libzfs_config.c \ libzfs_crypto.c \ libzfs_dataset.c \ libzfs_diff.c \ libzfs_import.c \ libzfs_iter.c \ libzfs_mount.c \ libzfs_pool.c \ libzfs_sendrecv.c \ libzfs_status.c \ libzfs_util.c # FreeBSD USER_C += \ libzfs_compat.c \ - libzfs_ioctl_compat.c \ libzfs_zmount.c # libshare USER_C += \ libshare.c \ nfs.c \ os/freebsd/nfs.c \ os/freebsd/smb.c KERNEL_C = \ algs/sha2/sha2.c \ cityhash.c \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ zprop_common.c ARCH_C = .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" ARCH_C += zfs_fletcher_intel.c \ zfs_fletcher_sse.c CFLAGS += -DHAVE_SSE2 .endif .if ${MACHINE_ARCH} == "amd64" ARCH_C += zfs_fletcher_avx512.c CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F .endif .if ${MACHINE_CPUARCH} == "aarch64" ARCH_C += zfs_fletcher_aarch64_neon.c .endif SRCS= $(USER_C) $(KERNEL_C) $(ARCH_C) WARNS?= 2 SHLIB_MAJOR= 4 CSTD= c99 CFLAGS+= -DIN_BASE CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h CFLAGS+= -DSYSCONFDIR=\"/etc\" CFLAGS+= -DPKGDATADIR=\"/usr/share/zfs\" .include diff --git a/cddl/lib/libzfs_core/Makefile b/cddl/lib/libzfs_core/Makefile index 52747bfcc2d8..abe0348f2313 100644 --- a/cddl/lib/libzfs_core/Makefile +++ b/cddl/lib/libzfs_core/Makefile @@ -1,28 +1,33 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core .PATH: ${SRCTOP}/sys/contrib/openzfs/include +.PATH: ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/freebsd/zfs LIB= zfs_core LIBADD= nvpair PACKAGE= runtime INCS= libzfs_core.h -SRCS= libzfs_core.c +SRCS= libzfs_core.c \ + os/freebsd/libzfs_core_ioctl.c \ + zfs_ioctl_compat.c WARNS?= 2 CSTD= c99 CFLAGS+= -DIN_BASE CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core/common CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h .include diff --git a/cddl/lib/libzutil/Makefile b/cddl/lib/libzutil/Makefile index 7aea9da14e90..85467151a5bb 100644 --- a/cddl/lib/libzutil/Makefile +++ b/cddl/lib/libzutil/Makefile @@ -1,42 +1,41 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil/os/freebsd .PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/freebsd/zfs LIB= zutil LIBADD= avl tpool PACKAGE= runtime INCS = zutil_import.h SRCS = \ zutil_device_path.c \ zutil_import.c \ zutil_import.h \ zutil_nicenum.c \ zutil_pool.c SRCS += \ - zutil_device_path_os.c \ - zutil_import_os.c \ - zutil_compat.c + os/freebsd/zutil_device_path_os.c \ + os/freebsd/zutil_import_os.c SRCS += zfs_ioctl_compat.c WARNS?= 2 CSTD= c99 CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzutil CFLAGS+= -DHAVE_ISSETUGID -DIN_BASE CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h .include diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 9776223b88d8..8dacb8082f05 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.1.99 Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 5.13 +Linux-Maximum: 5.14 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/arcstat/arcstat.in b/sys/contrib/openzfs/cmd/arcstat/arcstat.in index 9e7c52a6c7a3..cd9a803a2414 100755 --- a/sys/contrib/openzfs/cmd/arcstat/arcstat.in +++ b/sys/contrib/openzfs/cmd/arcstat/arcstat.in @@ -1,554 +1,554 @@ #!/usr/bin/env @PYTHON_SHEBANG@ # # Print out ZFS ARC Statistics exported via kstat(1) # For a definition of fields, or usage, use arcstat -v # # This script was originally a fork of the original arcstat.pl (0.1) # by Neelakanth Nadgir, originally published on his Sun blog on # 09/18/2007 # http://blogs.sun.com/realneel/entry/zfs_arc_statistics # # A new version aimed to improve upon the original by adding features # and fixing bugs as needed. This version was maintained by Mike # Harsch and was hosted in a public open source repository: # http://github.com/mharsch/arcstat # # but has since moved to the illumos-gate repository. # # This Python port was written by John Hixson for FreeNAS, introduced # in commit e2c29f: # https://github.com/freenas/freenas # # and has been improved by many people since. # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License, Version 1.0 only # (the "License"). You may not use this file except in compliance # with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Fields have a fixed width. Every interval, we fill the "v" # hash with its corresponding value (v[field]=value) using calculate(). # @hdr is the array of fields that needs to be printed, so we # just iterate over this array and print the values using our pretty printer. # # This script must remain compatible with Python 2.6+ and Python 3.4+. # import sys import time import getopt import re import copy from signal import signal, SIGINT, SIGWINCH, SIG_DFL cols = { # HDR: [Size, Scale, Description] "time": [8, -1, "Time"], "hits": [4, 1000, "ARC reads per second"], "miss": [4, 1000, "ARC misses per second"], "read": [4, 1000, "Total ARC accesses per second"], "hit%": [4, 100, "ARC hit percentage"], "miss%": [5, 100, "ARC miss percentage"], "dhit": [4, 1000, "Demand hits per second"], "dmis": [4, 1000, "Demand misses per second"], "dh%": [3, 100, "Demand hit percentage"], "dm%": [3, 100, "Demand miss percentage"], "phit": [4, 1000, "Prefetch hits per second"], "pmis": [4, 1000, "Prefetch misses per second"], "ph%": [3, 100, "Prefetch hits percentage"], "pm%": [3, 100, "Prefetch miss percentage"], "mhit": [4, 1000, "Metadata hits per second"], "mmis": [4, 1000, "Metadata misses per second"], "mread": [5, 1000, "Metadata accesses per second"], "mh%": [3, 100, "Metadata hit percentage"], "mm%": [3, 100, "Metadata miss percentage"], "arcsz": [5, 1024, "ARC size"], "size": [4, 1024, "ARC size"], "c": [4, 1024, "ARC target size"], "mfu": [4, 1000, "MFU list hits per second"], "mru": [4, 1000, "MRU list hits per second"], "mfug": [4, 1000, "MFU ghost list hits per second"], "mrug": [4, 1000, "MRU ghost list hits per second"], "eskip": [5, 1000, "evict_skip per second"], "el2skip": [7, 1000, "evict skip, due to l2 writes, per second"], "el2cach": [7, 1024, "Size of L2 cached evictions per second"], "el2el": [5, 1024, "Size of L2 eligible evictions per second"], "el2mfu": [6, 1024, "Size of L2 eligible MFU evictions per second"], "el2mru": [6, 1024, "Size of L2 eligible MRU evictions per second"], "el2inel": [7, 1024, "Size of L2 ineligible evictions per second"], "mtxmis": [6, 1000, "mutex_miss per second"], "dread": [5, 1000, "Demand accesses per second"], "pread": [5, 1000, "Prefetch accesses per second"], "l2hits": [6, 1000, "L2ARC hits per second"], "l2miss": [6, 1000, "L2ARC misses per second"], "l2read": [6, 1000, "Total L2ARC accesses per second"], "l2hit%": [6, 100, "L2ARC access hit percentage"], "l2miss%": [7, 100, "L2ARC access miss percentage"], "l2pref": [6, 1024, "L2ARC prefetch allocated size"], "l2mfu": [5, 1024, "L2ARC MFU allocated size"], "l2mru": [5, 1024, "L2ARC MRU allocated size"], "l2data": [6, 1024, "L2ARC data allocated size"], "l2meta": [6, 1024, "L2ARC metadata allocated size"], "l2pref%": [7, 100, "L2ARC prefetch percentage"], "l2mfu%": [6, 100, "L2ARC MFU percentage"], "l2mru%": [6, 100, "L2ARC MRU percentage"], "l2data%": [7, 100, "L2ARC data percentage"], "l2meta%": [7, 100, "L2ARC metadata percentage"], "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], "l2size": [6, 1024, "Size of the L2ARC"], "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], "grow": [4, 1000, "ARC grow disabled"], "need": [4, 1024, "ARC reclaim need"], "free": [4, 1024, "ARC free memory"], "avail": [5, 1024, "ARC available memory"], "waste": [5, 1024, "Wasted memory due to round up to pagesize"], } v = {} hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", "mm%", "size", "c", "avail"] xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread", "pread", "read"] sint = 1 # Default interval is 1 second count = 1 # Default count is 1 hdr_intr = 20 # Print header every 20 lines of output opfile = None sep = " " # Default separator is 2 spaces version = "0.4" l2exist = False cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} out = None kstat = None pretty_print = True if sys.platform.startswith('freebsd'): # Requires py-sysctl on FreeBSD import sysctl def kstat_update(): global kstat k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') if ctl.type != sysctl.CTLTYPE_NODE] if not k: sys.exit(1) kstat = {} for s in k: if not s: continue name, value = s.name, s.value # Trims 'kstat.zfs.misc.arcstats' from the name kstat[name[24:]] = int(value) elif sys.platform.startswith('linux'): def kstat_update(): global kstat k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] if not k: sys.exit(1) del k[0:2] kstat = {} for s in k: if not s: continue name, unused, value = s.split() kstat[name] = int(value) def detailed_usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("Field definitions are as follows:\n") for key in cols: sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) sys.stderr.write("\n") sys.exit(0) def usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("\t -h : Print this help message\n") sys.stderr.write("\t -a : Print all possible stats\n") sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n") sys.stderr.write("\nExamples:\n") sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") sys.stderr.write("\tarcstat -v\n") sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") sys.stderr.write("\n") sys.exit(1) def snap_stats(): global cur global kstat prev = copy.deepcopy(cur) kstat_update() cur = kstat for key in cur: if re.match(key, "class"): continue if key in prev: d[key] = cur[key] - prev[key] else: d[key] = cur[key] def prettynum(sz, scale, num=0): suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] index = 0 save = 0 # Special case for date field if scale == -1: return "%s" % num # Rounding error, return 0 elif 0 < num < 1: num = 0 while abs(num) > scale and index < 5: save = num num = num / scale index += 1 if index == 0: return "%*d" % (sz, num) if abs(save / scale) < 10: return "%*.1f%s" % (sz - 1, num, suffix[index]) else: return "%*d%s" % (sz - 1, num, suffix[index]) def print_values(): global hdr global sep global v global pretty_print if pretty_print: fmt = lambda col: prettynum(cols[col][0], cols[col][1], v[col]) else: fmt = lambda col: v[col] sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") sys.stdout.flush() def print_header(): global hdr global sep global pretty_print if pretty_print: fmt = lambda col: "%*s" % (cols[col][0], col) else: fmt = lambda col: col sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") def get_terminal_lines(): try: import fcntl import termios import struct data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') sz = struct.unpack('hh', data) return sz[0] except Exception: pass def update_hdr_intr(): global hdr_intr lines = get_terminal_lines() if lines and lines > 3: hdr_intr = lines - 3 def resize_handler(signum, frame): update_hdr_intr() def init(): global sint global count global hdr global xhdr global opfile global sep global out global l2exist global pretty_print desired_cols = None aflag = False xflag = False hflag = False vflag = False i = 1 try: opts, args = getopt.getopt( sys.argv[1:], "axo:hvs:f:p", [ "all", "extended", "outfile", "help", "verbose", "separator", "columns", "parsable" ] ) except getopt.error as msg: sys.stderr.write("Error: %s\n" % str(msg)) usage() opts = None for opt, arg in opts: if opt in ('-a', '--all'): aflag = True if opt in ('-x', '--extended'): xflag = True if opt in ('-o', '--outfile'): opfile = arg i += 1 if opt in ('-h', '--help'): hflag = True if opt in ('-v', '--verbose'): vflag = True if opt in ('-s', '--separator'): sep = arg i += 1 if opt in ('-f', '--columns'): desired_cols = arg i += 1 if opt in ('-p', '--parsable'): pretty_print = False i += 1 argv = sys.argv[i:] sint = int(argv[0]) if argv else sint count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) if hflag or (xflag and desired_cols): usage() if vflag: detailed_usage() if xflag: hdr = xhdr update_hdr_intr() # check if L2ARC exists snap_stats() l2_size = cur.get("l2_size") if l2_size: l2exist = True if desired_cols: hdr = desired_cols.split(",") invalid = [] incompat = [] for ele in hdr: if ele not in cols: invalid.append(ele) elif not l2exist and ele.startswith("l2"): sys.stdout.write("No L2ARC Here\n%s\n" % ele) incompat.append(ele) if len(invalid) > 0: sys.stderr.write("Invalid column definition! -- %s\n" % invalid) usage() if len(incompat) > 0: sys.stderr.write("Incompatible field specified! -- %s\n" % incompat) usage() if aflag: if l2exist: hdr = cols.keys() else: hdr = [col for col in cols.keys() if not col.startswith("l2")] if opfile: try: out = open(opfile, "w") sys.stdout = out except IOError: sys.stderr.write("Cannot open %s for writing\n" % opfile) sys.exit(1) def calculate(): global d global v global l2exist v = dict() v["time"] = time.strftime("%H:%M:%S", time.localtime()) - v["hits"] = d["hits"] / sint - v["miss"] = d["misses"] / sint + v["hits"] = d["hits"] // sint + v["miss"] = d["misses"] // sint v["read"] = v["hits"] + v["miss"] - v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 + v["hit%"] = 100 * v["hits"] // v["read"] if v["read"] > 0 else 0 v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 - v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint - v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint + v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) // sint + v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) // sint v["dread"] = v["dhit"] + v["dmis"] - v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 + v["dh%"] = 100 * v["dhit"] // v["dread"] if v["dread"] > 0 else 0 v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 - v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint + v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) // sint v["pmis"] = (d["prefetch_data_misses"] + - d["prefetch_metadata_misses"]) / sint + d["prefetch_metadata_misses"]) // sint v["pread"] = v["phit"] + v["pmis"] - v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 + v["ph%"] = 100 * v["phit"] // v["pread"] if v["pread"] > 0 else 0 v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 v["mhit"] = (d["prefetch_metadata_hits"] + - d["demand_metadata_hits"]) / sint + d["demand_metadata_hits"]) // sint v["mmis"] = (d["prefetch_metadata_misses"] + - d["demand_metadata_misses"]) / sint + d["demand_metadata_misses"]) // sint v["mread"] = v["mhit"] + v["mmis"] - v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 + v["mh%"] = 100 * v["mhit"] // v["mread"] if v["mread"] > 0 else 0 v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 v["arcsz"] = cur["size"] v["size"] = cur["size"] v["c"] = cur["c"] - v["mfu"] = d["mfu_hits"] / sint - v["mru"] = d["mru_hits"] / sint - v["mrug"] = d["mru_ghost_hits"] / sint - v["mfug"] = d["mfu_ghost_hits"] / sint - v["eskip"] = d["evict_skip"] / sint - v["el2skip"] = d["evict_l2_skip"] / sint - v["el2cach"] = d["evict_l2_cached"] / sint - v["el2el"] = d["evict_l2_eligible"] / sint - v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint - v["el2mru"] = d["evict_l2_eligible_mru"] / sint - v["el2inel"] = d["evict_l2_ineligible"] / sint - v["mtxmis"] = d["mutex_miss"] / sint + v["mfu"] = d["mfu_hits"] // sint + v["mru"] = d["mru_hits"] // sint + v["mrug"] = d["mru_ghost_hits"] // sint + v["mfug"] = d["mfu_ghost_hits"] // sint + v["eskip"] = d["evict_skip"] // sint + v["el2skip"] = d["evict_l2_skip"] // sint + v["el2cach"] = d["evict_l2_cached"] // sint + v["el2el"] = d["evict_l2_eligible"] // sint + v["el2mfu"] = d["evict_l2_eligible_mfu"] // sint + v["el2mru"] = d["evict_l2_eligible_mru"] // sint + v["el2inel"] = d["evict_l2_ineligible"] // sint + v["mtxmis"] = d["mutex_miss"] // sint if l2exist: - v["l2hits"] = d["l2_hits"] / sint - v["l2miss"] = d["l2_misses"] / sint + v["l2hits"] = d["l2_hits"] // sint + v["l2miss"] = d["l2_misses"] // sint v["l2read"] = v["l2hits"] + v["l2miss"] - v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0 + v["l2hit%"] = 100 * v["l2hits"] // v["l2read"] if v["l2read"] > 0 else 0 v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 v["l2asize"] = cur["l2_asize"] v["l2size"] = cur["l2_size"] - v["l2bytes"] = d["l2_read_bytes"] / sint + v["l2bytes"] = d["l2_read_bytes"] // sint v["l2pref"] = cur["l2_prefetch_asize"] v["l2mfu"] = cur["l2_mfu_asize"] v["l2mru"] = cur["l2_mru_asize"] v["l2data"] = cur["l2_bufc_data_asize"] v["l2meta"] = cur["l2_bufc_metadata_asize"] - v["l2pref%"] = 100 * v["l2pref"] / v["l2asize"] - v["l2mfu%"] = 100 * v["l2mfu"] / v["l2asize"] - v["l2mru%"] = 100 * v["l2mru"] / v["l2asize"] - v["l2data%"] = 100 * v["l2data"] / v["l2asize"] - v["l2meta%"] = 100 * v["l2meta"] / v["l2asize"] + v["l2pref%"] = 100 * v["l2pref"] // v["l2asize"] + v["l2mfu%"] = 100 * v["l2mfu"] // v["l2asize"] + v["l2mru%"] = 100 * v["l2mru"] // v["l2asize"] + v["l2data%"] = 100 * v["l2data"] // v["l2asize"] + v["l2meta%"] = 100 * v["l2meta"] // v["l2asize"] v["grow"] = 0 if cur["arc_no_grow"] else 1 v["need"] = cur["arc_need_free"] v["free"] = cur["memory_free_bytes"] v["avail"] = cur["memory_available_bytes"] v["waste"] = cur["abd_chunk_waste_size"] def main(): global sint global count global hdr_intr i = 0 count_flag = 0 init() if count > 0: count_flag = 1 signal(SIGINT, SIG_DFL) signal(SIGWINCH, resize_handler) while True: if i == 0: print_header() snap_stats() calculate() print_values() if count_flag == 1: if count <= 1: break count -= 1 i = 0 if i >= hdr_intr else i + 1 time.sleep(sint) if out: out.close() if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/cmd/vdev_id/vdev_id b/sys/contrib/openzfs/cmd/vdev_id/vdev_id index cad59c93f078..ab9d7326fc7b 100755 --- a/sys/contrib/openzfs/cmd/vdev_id/vdev_id +++ b/sys/contrib/openzfs/cmd/vdev_id/vdev_id @@ -1,789 +1,789 @@ #!/bin/sh # # vdev_id: udev helper to generate user-friendly names for JBOD disks # # This script parses the file /etc/zfs/vdev_id.conf to map a # physical path in a storage topology to a channel name. The # channel name is combined with a disk enclosure slot number to # create an alias that reflects the physical location of the drive. # This is particularly helpful when it comes to tasks like replacing # failed drives. Slot numbers may also be re-mapped in case the # default numbering is unsatisfactory. The drive aliases will be # created as symbolic links in /dev/disk/by-vdev. # # The currently supported topologies are sas_direct and sas_switch. # A multipath mode is supported in which dm-mpath devices are # handled by examining the first-listed running component disk. In # multipath mode the configuration file should contain a channel # definition with the same name for each path to a given enclosure. # # The alias keyword provides a simple way to map already-existing # device symlinks to more convenient names. It is suitable for # small, static configurations or for sites that have some automated # way to generate the mapping file. # # # Some example configuration files are given below. # # # # Example vdev_id.conf - sas_direct. # # # # multipath no # topology sas_direct # phys_per_port 4 # slot bay # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 1 A # channel 85:00.0 0 B # channel 86:00.0 1 C # channel 86:00.0 0 D # # # Custom mapping for Channel A # # # Linux Mapped # # Slot Slot Channel # slot 1 7 A # slot 2 10 A # slot 3 3 A # slot 4 6 A # # # Default mapping for B, C, and D # slot 1 4 # slot 2 2 # slot 3 1 # slot 4 3 # # # # Example vdev_id.conf - sas_switch # # # # topology sas_switch # # # SWITCH PORT CHANNEL NAME # channel 1 A # channel 2 B # channel 3 C # channel 4 D # # # # Example vdev_id.conf - multipath # # # # multipath yes # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 1 A # channel 85:00.0 0 B # channel 86:00.0 1 A # channel 86:00.0 0 B # # # # Example vdev_id.conf - multipath / multijbod-daisychaining # # # # multipath yes # multijbod yes # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 1 A # channel 85:00.0 0 B # channel 86:00.0 1 A # channel 86:00.0 0 B # # # # Example vdev_id.conf - multipath / mixed # # # # multipath yes # slot mix # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 3 A # channel 85:00.0 2 B # channel 86:00.0 3 A # channel 86:00.0 2 B # channel af:00.0 0 C # channel af:00.0 1 C # # # # Example vdev_id.conf - alias # # # # # by-vdev # # name fully qualified or base name of device link # alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca # alias d2 wwn-0x5000c5002def789e PATH=/bin:/sbin:/usr/bin:/usr/sbin CONFIG=/etc/zfs/vdev_id.conf PHYS_PER_PORT= DEV= TOPOLOGY= BAY= ENCL_ID="" UNIQ_ENCL_ID="" usage() { cat << EOF Usage: vdev_id [-h] vdev_id <-d device> [-c config_file] [-p phys_per_port] [-g sas_direct|sas_switch|scsi] [-m] -c specify name of an alternative config file [default=$CONFIG] -d specify basename of device (i.e. sda) -e Create enclose device symlinks only (/dev/by-enclosure) -g Storage network topology [default="$TOPOLOGY"] -m Run in multipath mode -j Run in multijbod mode -p number of phy's per switch port [default=$PHYS_PER_PORT] -h show this summary EOF exit 1 # exit with error to avoid processing usage message by a udev rule } map_slot() { LINUX_SLOT=$1 CHANNEL=$2 MAPPED_SLOT=$(awk -v linux_slot="$LINUX_SLOT" -v channel="$CHANNEL" \ '$1 == "slot" && $2 == linux_slot && \ ($4 ~ "^"channel"$" || $4 ~ /^$/) { print $3; exit}' $CONFIG) if [ -z "$MAPPED_SLOT" ] ; then MAPPED_SLOT=$LINUX_SLOT fi printf "%d" "${MAPPED_SLOT}" } map_channel() { MAPPED_CHAN= PCI_ID=$1 PORT=$2 case $TOPOLOGY in "sas_switch") MAPPED_CHAN=$(awk -v port="$PORT" \ '$1 == "channel" && $2 == port \ { print $3; exit }' $CONFIG) ;; "sas_direct"|"scsi") MAPPED_CHAN=$(awk -v pciID="$PCI_ID" -v port="$PORT" \ '$1 == "channel" && $2 == pciID && $3 == port \ {print $4}' $CONFIG) ;; esac printf "%s" "${MAPPED_CHAN}" } get_encl_id() { set -- $(echo $1) count=$# i=1 while [ $i -le $count ] ; do d=$(eval echo '$'{$i}) id=$(cat "/sys/class/enclosure/${d}/id") ENCL_ID="${ENCL_ID} $id" i=$((i + 1)) done } get_uniq_encl_id() { for uuid in ${ENCL_ID}; do found=0 for count in ${UNIQ_ENCL_ID}; do if [ $count = $uuid ]; then found=1 break fi done if [ $found -eq 0 ]; then UNIQ_ENCL_ID="${UNIQ_ENCL_ID} $uuid" fi done } # map_jbod explainer: The bsg driver knows the difference between a SAS # expander and fanout expander. Use hostX instance along with top-level # (whole enclosure) expander instances in /sys/class/enclosure and # matching a field in an array of expanders, using the index of the # matched array field as the enclosure instance, thereby making jbod IDs # dynamic. Avoids reliance on high overhead userspace commands like # multipath and lsscsi and instead uses existing sysfs data. $HOSTCHAN # variable derived from devpath gymnastics in sas_handler() function. map_jbod() { DEVEXP=$(ls -l "/sys/block/$DEV/device/" | grep enclos | awk -F/ '{print $(NF-1) }') DEV=$1 # Use "set --" to create index values (Arrays) set -- $(ls -l /sys/class/enclosure | grep -v "^total" | awk '{print $9}') # Get count of total elements JBOD_COUNT=$# JBOD_ITEM=$* # Build JBODs (enclosure) id from sys/class/enclosure//id get_encl_id "$JBOD_ITEM" # Different expander instances for each paths. # Filter out and keep only unique id. get_uniq_encl_id # Identify final 'mapped jbod' j=0 for count in ${UNIQ_ENCL_ID}; do i=1 j=$((j + 1)) while [ $i -le $JBOD_COUNT ] ; do d=$(eval echo '$'{$i}) id=$(cat "/sys/class/enclosure/${d}/id") if [ "$d" = "$DEVEXP" ] && [ $id = $count ] ; then MAPPED_JBOD=$j break fi i=$((i + 1)) done done printf "%d" "${MAPPED_JBOD}" } sas_handler() { if [ -z "$PHYS_PER_PORT" ] ; then PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then MULTIPATH_MODE=$(awk '$1 == "multipath" \ {print $2; exit}' $CONFIG) fi if [ -z "$MULTIJBOD_MODE" ] ; then MULTIJBOD_MODE=$(awk '$1 == "multijbod" \ {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then DM_NAME=$(ls -l --full-time /dev/mapper | grep "$DEV"$ | awk '{print $9}') fi # For raw disks udev exports DEVTYPE=partition when # handling partitions, and the rules can be written to # take advantage of this to append a -part suffix. For # dm devices we get DEVTYPE=disk even for partitions so # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then # Match p[number], remove the 'p' and prepend "-part" PART=$(echo "$DM_NAME" | awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi # Strip off partition information. DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Utilize DM device name to gather subordinate block devices # using sysfs to avoid userspace utilities # If our DEVNAME is something like /dev/dm-177, then we may be # able to get our DMDEV from it. DMDEV=$(echo $DEVNAME | sed 's;/dev/;;g') if [ ! -e /sys/block/$DMDEV/slaves/* ] ; then # It's not there, try looking in /dev/mapper DMDEV=$(ls -l --full-time /dev/mapper | grep $DM_NAME | awk '{gsub("../", " "); print $NF}') fi # Use sysfs pointers in /sys/block/dm-X/slaves because using # userspace tools creates lots of overhead and should be avoided # whenever possible. Use awk to isolate lowest instance of # sd device member in dm device group regardless of string # length. DEV=$(ls "/sys/block/$DMDEV/slaves" | awk ' { len=sprintf ("%20s",length($0)); gsub(/ /,0,str); a[NR]=len "_" $0; } END { asort(a) print substr(a[1],22) }') if [ -z "$DEV" ] ; then return fi fi if echo "$DEV" | grep -q ^/devices/ ; then sys_path=$DEV else sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null) fi # Use positional parameters as an ad-hoc array set -- $(echo "$sys_path" | tr / ' ') num_dirs=$# scsi_host_dir="/sys" # Get path up to /sys/.../hostX i=1 while [ $i -le "$num_dirs" ] ; do d=$(eval echo '$'{$i}) scsi_host_dir="$scsi_host_dir/$d" echo "$d" | grep -q -E '^host[0-9]+$' && break i=$((i + 1)) done # Lets grab the SAS host channel number and save it for JBOD sorting later HOSTCHAN=$(echo "$d" | awk -F/ '{ gsub("host","",$NF); print $NF}') if [ $i = "$num_dirs" ] ; then return fi PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}') # In sas_switch mode, the directory four levels beneath # /sys/.../hostX contains symlinks to phy devices that reveal # the switch port number. In sas_direct mode, the phy links one # directory down reveal the HBA port. port_dir=$scsi_host_dir case $TOPOLOGY in "sas_switch") j=$((i + 4)) ;; "sas_direct") j=$((i + 1)) ;; esac i=$((i + 1)) while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo '$'{$i})" i=$((i + 1)) done PHY=$(ls -d "$port_dir"/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}') if [ -z "$PHY" ] ; then PHY=0 fi PORT=$((PHY / PHYS_PER_PORT)) # Look in /sys/.../sas_device/end_device-X for the bay_identifier # attribute. end_device_dir=$port_dir while [ $i -lt "$num_dirs" ] ; do d=$(eval echo '$'{$i}) end_device_dir="$end_device_dir/$d" if echo "$d" | grep -q '^end_device' ; then end_device_dir="$end_device_dir/sas_device/$d" break fi i=$((i + 1)) done # Add 'mix' slot type for environments where dm-multipath devices # include end-devices connected via SAS expanders or direct connection # to SAS HBA. A mixed connectivity environment such as pool devices # contained in a SAS JBOD and spare drives or log devices directly # connected in a server backplane without expanders in the I/O path. SLOT= case $BAY in "bay") SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) ;; "mix") if [ $(cat "$end_device_dir/bay_identifier" 2>/dev/null) ] ; then SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) else SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null) fi ;; "phy") SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null) ;; "port") d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "id") i=$((i + 1)) d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "lun") i=$((i + 2)) d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "ses") # look for this SAS path in all SCSI Enclosure Services # (SES) enclosures sas_address=$(cat "$end_device_dir/sas_address" 2>/dev/null) enclosures=$(lsscsi -g | \ sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p') for enclosure in $enclosures; do set -- $(sg_ses -p aes "$enclosure" | \ awk "/device slot number:/{slot=\$12} \ /SAS address: $sas_address/\ {print slot}") SLOT=$1 if [ -n "$SLOT" ] ; then break fi done ;; esac if [ -z "$SLOT" ] ; then return fi if [ "$MULTIJBOD_MODE" = "yes" ] ; then CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") JBOD=$(map_jbod "$DEV") if [ -z "$CHAN" ] ; then return fi echo "${CHAN}"-"${JBOD}"-"${SLOT}${PART}" else CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") if [ -z "$CHAN" ] ; then return fi echo "${CHAN}${SLOT}${PART}" fi } scsi_handler() { if [ -z "$FIRST_BAY_NUMBER" ] ; then FIRST_BAY_NUMBER=$(awk '$1 == "first_bay_number" \ {print $2; exit}' $CONFIG) fi FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} if [ -z "$PHYS_PER_PORT" ] ; then PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then MULTIPATH_MODE=$(awk '$1 == "multipath" \ {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then DM_NAME=$(ls -l --full-time /dev/mapper | grep "$DEV"$ | awk '{print $9}') fi # For raw disks udev exports DEVTYPE=partition when # handling partitions, and the rules can be written to # take advantage of this to append a -part suffix. For # dm devices we get DEVTYPE=disk even for partitions so # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then # Match p[number], remove the 'p' and prepend "-part" PART=$(echo "$DM_NAME" | awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi # Strip off partition information. DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Get the raw scsi device name from multipath -ll. Strip off # leading pipe symbols to make field numbering consistent. DEV=$(multipath -ll "$DM_NAME" | awk '/running/{gsub("^[|]"," "); print $3 ; exit}') if [ -z "$DEV" ] ; then return fi fi if echo "$DEV" | grep -q ^/devices/ ; then sys_path=$DEV else sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null) fi # expect sys_path like this, for example: # /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv # Use positional parameters as an ad-hoc array set -- $(echo "$sys_path" | tr / ' ') num_dirs=$# scsi_host_dir="/sys" # Get path up to /sys/.../hostX i=1 while [ $i -le "$num_dirs" ] ; do d=$(eval echo '$'{$i}) scsi_host_dir="$scsi_host_dir/$d" echo "$d" | grep -q -E '^host[0-9]+$' && break i=$((i + 1)) done if [ $i = "$num_dirs" ] ; then return fi PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}') # In scsi mode, the directory two levels beneath # /sys/.../hostX reveals the port and slot. port_dir=$scsi_host_dir j=$((i + 2)) i=$((i + 1)) while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo '$'{$i})" i=$((i + 1)) done set -- $(echo "$port_dir" | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') PORT=$1 SLOT=$(($2 + FIRST_BAY_NUMBER)) if [ -z "$SLOT" ] ; then return fi CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") if [ -z "$CHAN" ] ; then return fi echo "${CHAN}${SLOT}${PART}" } # Figure out the name for the enclosure symlink enclosure_handler () { # We get all the info we need from udev's DEVPATH variable: # # DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0 # Get the enclosure ID ("0:0:0:0") ENC=$(basename $(readlink -m "/sys/$DEVPATH/../..")) if [ ! -d "/sys/class/enclosure/$ENC" ] ; then # Not an enclosure, bail out return fi # Get the long sysfs device path to our enclosure. Looks like: # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0 ENC_DEVICE=$(readlink "/sys/class/enclosure/$ENC") # Grab the full path to the hosts port dir: # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0 PORT_DIR=$(echo "$ENC_DEVICE" | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') # Get the port number PORT_ID=$(echo "$PORT_DIR" | grep -Eo "[0-9]+$") # The PCI directory is two directories up from the port directory # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0 PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../..")) # Strip down the PCI address from 0000:05:00.0 to 05:00.0 PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') # Name our device according to vdev_id.conf (like "L0" or "U1"). - NAME=$(awk '/channel/{if ($1 == "channel" && $2 == "$PCI_ID" && \ - $3 == "$PORT_ID") {print ${4}int(count[$4])}; count[$4]++}' $CONFIG) + NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ + \$3 == \"$PORT_ID\") {print \$4\$3}}" $CONFIG) echo "${NAME}" } alias_handler () { # Special handling is needed to correctly append a -part suffix # to partitions of device mapper devices. The DEVTYPE attribute # is normally set to "disk" instead of "partition" in this case, # so the udev rules won't handle that for us as they do for # "plain" block devices. # # For example, we may have the following links for a device and its # partitions, # # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0 -> ../../dm-0 # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1 # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3 # # and the following alias in vdev_id.conf. # # alias A0 dm-name-isw_dibgbfcije_ARRAY0 # # The desired outcome is for the following links to be created # without having explicitly defined aliases for the partitions. # # /dev/disk/by-vdev/A0 -> ../../dm-0 # /dev/disk/by-vdev/A0-part1 -> ../../dm-1 # /dev/disk/by-vdev/A0-part2 -> ../../dm-3 # # Warning: The following grep pattern will misidentify whole-disk # devices whose names end with 'p' followed by a string of # digits as partitions, causing alias creation to fail. This # ambiguity seems unavoidable, so devices using this facility # must not use such names. DM_PART= if echo "$DM_NAME" | grep -q -E 'p[0-9][0-9]*$' ; then if [ "$DEVTYPE" != "partition" ] ; then # Match p[number], remove the 'p' and prepend "-part" DM_PART=$(echo "$DM_NAME" | awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi fi # DEVLINKS attribute must have been populated by already-run udev rules. for link in $DEVLINKS ; do # Remove partition information to match key of top-level device. if [ -n "$DM_PART" ] ; then link=$(echo "$link" | sed 's/p[0-9][0-9]*$//') fi # Check both the fully qualified and the base name of link. for l in $link $(basename "$link") ; do if [ ! -z "$l" ]; then alias=$(awk -v var="$l" '($1 == "alias") && \ ($3 == var) \ { print $2; exit }' $CONFIG) if [ -n "$alias" ] ; then echo "${alias}${DM_PART}" return fi fi done done } # main while getopts 'c:d:eg:jmp:h' OPTION; do case ${OPTION} in c) CONFIG=${OPTARG} ;; d) DEV=${OPTARG} ;; e) # When udev sees a scsi_generic device, it calls this script with -e to # create the enclosure device symlinks only. We also need # "enclosure_symlinks yes" set in vdev_id.config to actually create the # symlink. ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") \ print $2}' "$CONFIG") if [ "$ENCLOSURE_MODE" != "yes" ] ; then exit 0 fi ;; g) TOPOLOGY=$OPTARG ;; p) PHYS_PER_PORT=${OPTARG} ;; j) MULTIJBOD_MODE=yes ;; m) MULTIPATH_MODE=yes ;; h) usage ;; esac done if [ ! -r "$CONFIG" ] ; then echo "Error: Config file \"$CONFIG\" not found" exit 1 fi if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then echo "Error: missing required option -d" exit 1 fi if [ -z "$TOPOLOGY" ] ; then TOPOLOGY=$(awk '($1 == "topology") {print $2; exit}' "$CONFIG") fi if [ -z "$BAY" ] ; then BAY=$(awk '($1 == "slot") {print $2; exit}' "$CONFIG") fi TOPOLOGY=${TOPOLOGY:-sas_direct} # Should we create /dev/by-enclosure symlinks? if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then ID_ENCLOSURE=$(enclosure_handler) if [ -z "$ID_ENCLOSURE" ] ; then exit 0 fi # Just create the symlinks to the enclosure devices and then exit. ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' "$CONFIG") if [ -z "$ENCLOSURE_PREFIX" ] ; then ENCLOSURE_PREFIX="enc" fi echo "ID_ENCLOSURE=$ID_ENCLOSURE" echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE" exit 0 fi # First check if an alias was defined for this device. ID_VDEV=$(alias_handler) if [ -z "$ID_VDEV" ] ; then BAY=${BAY:-bay} case $TOPOLOGY in sas_direct|sas_switch) ID_VDEV=$(sas_handler) ;; scsi) ID_VDEV=$(scsi_handler) ;; *) echo "Error: unknown topology $TOPOLOGY" exit 1 ;; esac fi if [ -n "$ID_VDEV" ] ; then echo "ID_VDEV=${ID_VDEV}" echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}" fi diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 797cac0874fe..81b9604f3789 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -1,8827 +1,8828 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright (c) 2015, 2017, Intel Corporation. * Copyright (c) 2020 Datto Inc. * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdb.h" #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ zio_compress_table[(idx)].ci_name : "UNKNOWN") #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ DMU_OT_ZAP_OTHER : \ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) /* Some platforms require part of inode IDs to be remapped */ #ifdef __APPLE__ #define ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2) #else #define ZDB_MAP_OBJECT_ID(obj) (obj) #endif static char * zdb_ot_name(dmu_object_type_t type) { if (type < DMU_OT_NUMTYPES) return (dmu_ot[type].ot_name); else if ((type & DMU_OT_NEWTYPE) && ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); else return ("UNKNOWN"); } extern int reference_tracking_enable; extern int zfs_recover; extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern int zfs_reconstruct_indirect_combinations_max; extern int zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); uint64_t *zopt_metaslab = NULL; static unsigned zopt_metaslab_args = 0; typedef struct zopt_object_range { uint64_t zor_obj_start; uint64_t zor_obj_end; uint64_t zor_flags; } zopt_object_range_t; zopt_object_range_t *zopt_object_ranges = NULL; static unsigned zopt_object_args = 0; static int flagbits[256]; #define ZOR_FLAG_PLAIN_FILE 0x0001 #define ZOR_FLAG_DIRECTORY 0x0002 #define ZOR_FLAG_SPACE_MAP 0x0004 #define ZOR_FLAG_ZAP 0x0008 #define ZOR_FLAG_ALL_TYPES -1 #define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ ZOR_FLAG_DIRECTORY | \ ZOR_FLAG_SPACE_MAP | \ ZOR_FLAG_ZAP) #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_RAW 0x0020 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 #define ZDB_FLAG_VERBOSE 0x0080 uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static range_tree_t *mos_refd_objs; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); typedef struct sublivelist_verify { /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ zfs_btree_t sv_pair; /* ALLOC's without a matching FREE, accumulates across sub-livelists */ zfs_btree_t sv_leftover; } sublivelist_verify_t; static int livelist_compare(const void *larg, const void *rarg) { const blkptr_t *l = larg; const blkptr_t *r = rarg; /* Sort them according to dva[0] */ uint64_t l_dva0_vdev, r_dva0_vdev; l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); if (l_dva0_vdev < r_dva0_vdev) return (-1); else if (l_dva0_vdev > r_dva0_vdev) return (+1); /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset; uint64_t r_dva0_offset; l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); if (l_dva0_offset < r_dva0_offset) { return (-1); } else if (l_dva0_offset > r_dva0_offset) { return (+1); } /* * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, * it's possible the offsets are equal. In that case, sort by txg */ if (l->blk_birth < r->blk_birth) { return (-1); } else if (l->blk_birth > r->blk_birth) { return (+1); } return (0); } typedef struct sublivelist_verify_block { dva_t svb_dva; /* * We need this to check if the block marked as allocated * in the livelist was freed (and potentially reallocated) * in the metaslab spacemaps at a later TXG. */ uint64_t svb_allocated_txg; } sublivelist_verify_block_t; static void zdb_print_blkptr(const blkptr_t *bp, int flags); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ blkptr_t svbr_blk; /* * Refcount gets incremented to 1 when we encounter the first * FREE entry for the svfbr block pointer and a node for it * is created in our ZDB verification/tracking metadata. * * As we encounter more FREE entries we increment this counter * and similarly decrement it whenever we find the respective * ALLOC entries for this block. * * When the refcount gets to 0 it means that all the FREE and * ALLOC entries of this block have paired up and we no longer * need to track it in our verification logic (e.g. the node * containing this struct in our verification data structure * should be freed). * * [refer to sublivelist_verify_blkptr() for the actual code] */ uint32_t svbr_refcnt; } sublivelist_verify_block_refcnt_t; static int sublivelist_block_refcnt_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_refcnt_t *l = larg; const sublivelist_verify_block_refcnt_t *r = rarg; return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); } static int sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx) { ASSERT3P(tx, ==, NULL); struct sublivelist_verify *sv = arg; sublivelist_verify_block_refcnt_t current = { .svbr_blk = *bp, /* * Start with 1 in case this is the first free entry. * This field is not used for our B-Tree comparisons * anyway. */ .svbr_refcnt = 1, }; zfs_btree_index_t where; sublivelist_verify_block_refcnt_t *pair = zfs_btree_find(&sv->sv_pair, ¤t, &where); if (free) { if (pair == NULL) { /* first free entry for this block pointer */ zfs_btree_add(&sv->sv_pair, ¤t); } else { pair->svbr_refcnt++; } } else { if (pair == NULL) { /* block that is currently marked as allocated */ for (int i = 0; i < SPA_DVAS_PER_BP; i++) { if (DVA_IS_EMPTY(&bp->blk_dva[i])) break; sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], .svb_allocated_txg = bp->blk_birth }; if (zfs_btree_find(&sv->sv_leftover, &svb, &where) == NULL) { zfs_btree_add_idx(&sv->sv_leftover, &svb, &where); } } } else { /* alloc matches a free entry */ pair->svbr_refcnt--; if (pair->svbr_refcnt == 0) { /* all allocs and frees have been matched */ zfs_btree_remove_idx(&sv->sv_pair, &where); } } } return (0); } static int sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) { int err; struct sublivelist_verify *sv = args; zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, sv, NULL); sublivelist_verify_block_refcnt_t *e; zfs_btree_index_t *cookie = NULL; while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &e->svbr_blk, B_TRUE); (void) printf("\tERROR: %d unmatched FREE(s): %s\n", e->svbr_refcnt, blkbuf); } zfs_btree_destroy(&sv->sv_pair); return (err); } static int livelist_block_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_t *l = larg; const sublivelist_verify_block_t *r = rarg; if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) return (-1); else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) return (+1); if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) return (-1); else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) return (+1); if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) return (-1); else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) return (+1); return (0); } /* * Check for errors in a livelist while tracking all unfreed ALLOCs in the * sublivelist_verify_t: sv->sv_leftover */ static void livelist_verify(dsl_deadlist_t *dl, void *arg) { sublivelist_verify_t *sv = arg; dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); } /* * Check for errors in the livelist entry and discard the intermediary * data structures */ /* ARGSUSED */ static int sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); zfs_btree_destroy(&sv.sv_leftover); return (err); } typedef struct metaslab_verify { /* * Tree containing all the leftover ALLOCs from the livelists * that are part of this metaslab. */ zfs_btree_t mv_livelist_allocs; /* * Metaslab information. */ uint64_t mv_vdid; uint64_t mv_msid; uint64_t mv_start; uint64_t mv_end; /* * What's currently allocated for this metaslab. */ range_tree_t *mv_allocated; } metaslab_verify_t; typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg); typedef struct unflushed_iter_cb_arg { spa_t *uic_spa; uint64_t uic_txg; void *uic_arg; zdb_log_sm_cb_t uic_cb; } unflushed_iter_cb_arg_t; static int iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) { unflushed_iter_cb_arg_t *uic = arg; return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); } static void iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); unflushed_iter_cb_arg_t uic = { .uic_spa = spa, .uic_txg = sls->sls_txg, .uic_arg = arg, .uic_cb = cb }; VERIFY0(space_map_iterate(sm, space_map_length(sm), iterate_through_spacemap_logs_cb, &uic)); space_map_close(sm); } spa_config_exit(spa, SCL_CONFIG, FTAG); } static void verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, uint64_t offset, uint64_t size) { sublivelist_verify_block_t svb; DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); DVA_SET_OFFSET(&svb.svb_dva, offset); DVA_SET_ASIZE(&svb.svb_dva, size); zfs_btree_index_t where; uint64_t end_offset = offset + size; /* * Look for an exact match for spacemap entry in the livelist entries. * Then, look for other livelist entries that fall within the range * of the spacemap entry as it may have been condensed */ sublivelist_verify_block_t *found = zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); if (found == NULL) { found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); } for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && DVA_GET_OFFSET(&found->svb_dva) < end_offset; found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { if (found->svb_allocated_txg <= txg) { (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " "from TXG %llx FREED at TXG %llx\n", (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), (u_longlong_t)found->svb_allocated_txg, (u_longlong_t)txg); } } } static int metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t txg = sme->sme_txg; if (sme->sme_type == SM_ALLOC) { if (range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE ALLOC: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { range_tree_add(mv->mv_allocated, offset, size); } } else { if (!range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE FREE: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { range_tree_remove(mv->mv_allocated, offset, size); } } if (sme->sme_type != SM_ALLOC) { /* * If something is freed in the spacemap, verify that * it is not listed as allocated in the livelist. */ verify_livelist_allocs(mv, txg, offset, size); } return (0); } static int spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); if (vdev_id != mv->mv_vdid) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; if (ms->ms_id != mv->mv_msid) return (0); if (txg < metaslab_unflushed_txg(ms)) return (0); ASSERT3U(txg, ==, sme->sme_txg); return (metaslab_spacemap_validation_cb(sme, mv)); } static void spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) { iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); } static void spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) { if (sm == NULL) return; VERIFY0(space_map_iterate(sm, space_map_length(sm), metaslab_spacemap_validation_cb, mv)); } static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); /* * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if * they are part of that metaslab (mv_msid). */ static void mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) { zfs_btree_index_t where; sublivelist_verify_block_t *svb; ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); for (svb = zfs_btree_first(&sv->sv_leftover, &where); svb != NULL; svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) continue; if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && (DVA_GET_OFFSET(&svb->svb_dva) + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { (void) printf("ERROR: Found block that crosses " "metaslab boundary: <%llu:%llx:%llx>\n", (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); continue; } if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) continue; if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) continue; if ((DVA_GET_OFFSET(&svb->svb_dva) + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { (void) printf("ERROR: Found block that crosses " "metaslab boundary: <%llu:%llx:%llx>\n", (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); continue; } zfs_btree_add(&mv->mv_livelist_allocs, svb); } for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); svb != NULL; svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { zfs_btree_remove(&sv->sv_leftover, svb); } } /* * [Livelist Check] * Iterate through all the sublivelists and: * - report leftover frees (**) * - record leftover ALLOCs together with their TXG [see Cross Check] * * (**) Note: Double ALLOCs are valid in datasets that have dedup * enabled. Similarly double FREEs are allowed as well but * only if they pair up with a corresponding ALLOC entry once * we our done with our sublivelist iteration. * * [Spacemap Check] * for each metaslab: * - iterate over spacemap and then the metaslab's entries in the * spacemap log, then report any double FREEs and ALLOCs (do not * blow up). * * [Cross Check] * After finishing the Livelist Check phase and while being in the * Spacemap Check phase, we find all the recorded leftover ALLOCs * of the livelist check that are part of the metaslab that we are * currently looking at in the Spacemap Check. We report any entries * that are marked as ALLOCs in the livelists but have been actually * freed (and potentially allocated again) after their TXG stamp in * the spacemaps. Also report any ALLOCs from the livelists that * belong to indirect vdevs (e.g. their vdev completed removal). * * Note that this will miss Log Spacemap entries that cancelled each other * out before being flushed to the metaslab, so we are not guaranteed * to match all erroneous ALLOCs. */ static void livelist_metaslab_validate(spa_t *spa) { (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); (void) printf("Verifying metaslab entries\n"); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { metaslab_t *m = vd->vdev_ms[mid]; (void) fprintf(stderr, "\rverifying concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)mid, (longlong_t)vd->vdev_ms_count); uint64_t shift, start; range_seg_type_t type = metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; mv.mv_allocated = range_tree_create(NULL, type, NULL, start, shift); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, livelist_block_compare, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); spacemap_check_ms_sm(m->ms_sm, &mv); spacemap_check_sm_log(spa, &mv); range_tree_vacate(mv.mv_allocated, NULL, NULL); range_tree_destroy(mv.mv_allocated); zfs_btree_clear(&mv.mv_livelist_allocs); zfs_btree_destroy(&mv.mv_livelist_allocs); } } (void) fprintf(stderr, "\n"); /* * If there are any segments in the leftover tree after we walked * through all the metaslabs in the concrete vdevs then this means * that we have segments in the livelists that belong to indirect * vdevs and are marked as allocated. */ if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { zfs_btree_destroy(&sv.sv_leftover); return; } (void) printf("ERROR: Found livelist blocks marked as allocated " "for indirect vdevs:\n"); zfs_btree_index_t *where = NULL; sublivelist_verify_block_t *svb; while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != NULL) { int vdev_id = DVA_GET_VDEV(&svb->svb_dva); ASSERT3U(vdev_id, <, rvd->vdev_children); vdev_t *vd = rvd->vdev_child[vdev_id]; ASSERT(!vdev_is_concrete(vd)); (void) printf("<%d:%llx:%llx> TXG %llx\n", vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), (u_longlong_t)svb->svb_allocated_txg); } (void) printf("\n"); zfs_btree_destroy(&sv.sv_leftover); } /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ]\n" "\t\t[[/] [ ...]\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O \n" "\t%s -r \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers or object number " "ranges are specified, only those\n" " objects or ranges are dumped.\n\n"); (void) fprintf(stderr, " Object ranges take the form :[:]\n" " start Starting object number\n" " end Ending object number, or -1 for no upper bound\n" " flags Optional flags to select object types:\n" " A All objects (this is the default)\n" " d ZFS directories\n" " f ZFS files \n" " m SPA space maps\n" " z ZAPs\n" " - Negate effect of next flag\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -E decode and display block from an " "embedded block pointer\n"); (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); (void) fprintf(stderr, " -k examine the checkpointed state " "of the pool\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -O perform object lookups by path\n"); (void) fprintf(stderr, " -r copy an object by path to file\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " "others)\n"); (void) fprintf(stderr, " -y perform livelist and metaslab " "validation on any livelists being deleted\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n"); (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " "exiting\n"); (void) fprintf(stderr, " -I -- " "specify the maximum number of\n " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, " -o = set global " "variable to an unsigned 32-bit integer\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -q don't print label contents\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -U -- use alternate " "cachefile\n"); (void) fprintf(stderr, " -V do verbatim import\n"); (void) fprintf(stderr, " -x -- " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n"); (void) fprintf(stderr, " -Y attempt all reconstruction " "combinations for split blocks\n"); (void) fprintf(stderr, " -Z show ZSTD headers \n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } static void dump_debug_buffer(void) { if (dump_opt['G']) { (void) printf("\n"); (void) fflush(stdout); zfs_dbgmsg_print("zdb"); } } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); exit(1); } /* ARGSUSED */ static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } /* ARGSUSED */ static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, sizeof (buf)); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) maxidx = i; if (histo[i] > 0 && i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) printf("\tUNKNOWN OBJECT TYPE\n"); } /*ARGSUSED*/ static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { uint64_t *arr; uint64_t oursize; if (dump_opt['d'] < 6) return; if (data == NULL) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); size = doi.doi_max_offset; /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); arr = kmem_alloc(oursize, KM_SLEEP); int err = dmu_read(os, object, 0, oursize, arr, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(arr, oursize); return; } } else { /* * Even though the allocation is already done in this code path, * we still cap the size to prevent excessive printing. */ oursize = MIN(size, 1 << 20); arr = data; } if (size == 0) { (void) printf("\t\t[]\n"); return; } (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { if (i % 4 != 0) (void) printf(", %0llx", (u_longlong_t)arr[i]); else (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); } if (oursize != size) (void) printf(", ... "); (void) printf("]\n"); if (data == NULL) kmem_free(arr, oursize); } /*ARGSUSED*/ static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); (void) zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, prop); if (attr.za_integer_length == 1) { if (strcmp(attr.za_name, DSL_CRYPTO_KEY_MASTER_KEY) == 0 || strcmp(attr.za_name, DSL_CRYPTO_KEY_HMAC_KEY) == 0 || strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 || strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 || strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) { uint8_t *u8 = prop; for (i = 0; i < attr.za_num_integers; i++) { (void) printf("%02x", u8[i]); } } else { (void) printf("%s", (char *)prop); } } else { for (i = 0; i < attr.za_num_integers; i++) { switch (attr.za_integer_length) { case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; uint64_t i; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= BPOBJ_SIZE_V2) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (size >= sizeof (*bpop)) { (void) printf("\t\tnum_freed = %llu\n", (u_longlong_t)bpop->bpo_num_freed); } if (dump_opt['d'] < 5) return; for (i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, BP_GET_FREE(&bp)); (void) printf("\t%s\n", blkbuf); } } /* ARGSUSED */ static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) { dmu_object_info_t doi; int64_t i; VERIFY0(dmu_object_info(os, object, &doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } /*ARGSUSED*/ static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attr.za_first_integer, (int)ATTR_LENGTH(attr.za_first_integer), (int)ATTR_BSWAP(attr.za_first_integer), (int)ATTR_NUM(attr.za_first_integer)); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; uint16_t *layout_attrs; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attr.za_name); if (attr.za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attr.za_integer_length == 2); layout_attrs = umem_zalloc(attr.za_num_integers * attr.za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attr.za_name, attr.za_integer_length, attr.za_num_integers, layout_attrs) == 0); for (i = 0; i != attr.za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attr.za_num_integers * attr.za_integer_length); } zap_cursor_fini(&zc); } /*ARGSUSED*/ static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; zap_attribute_t attr; const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); } zap_cursor_fini(&zc); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { uint64_t obsolete_sm_object; int refcount = 0; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (vd->vdev_top == vd && obsolete_sm_object != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, obsolete_sm_object, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { refcount++; } } else { ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); ASSERT3U(obsolete_sm_object, ==, 0); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); } return (refcount); } static int get_prev_obsolete_spacemap_refcount(spa_t *spa) { uint64_t prev_obj = spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; if (prev_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { return (1); } } return (0); } static int get_checkpoint_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && zap_contains(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) refcount++; for (uint64_t c = 0; c < vd->vdev_children; c++) refcount += get_checkpoint_refcount(vd->vdev_child[c]); return (refcount); } static int get_log_spacemap_refcount(spa_t *spa) { return (avl_numnodes(&spa->spa_sm_logs_by_txg)); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; (void) printf("space map object %llu:\n", (longlong_t)sm->sm_object); (void) printf(" smp_length = 0x%llx\n", (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); if (dump_opt['d'] < 6 && dump_opt['m'] < 4) return; /* * Print out the freelist entries in both encoded and decoded form. */ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); if (de_txg == 0) { (void) printf( "\t [%6llu] PADDING\n", (u_longlong_t)entry_id); } else { (void) printf( "\t [%6llu] %s: txg %llu pass %llu\n", (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)de_txg, (u_longlong_t)de_sync_pass); } entry_id++; continue; } uint8_t words; char entry_type; uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; words = 1; } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); ASSERT3U(offset, <=, space_map_length(sm)); entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; words = 2; } (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, (u_longlong_t)entry_vdev, words); if (entry_type == 'A') alloc += entry_run; else alloc -= entry_run; entry_id++; } if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if (dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (vd->vdev_ops == &vdev_draid_ops) ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); else ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", (u_longlong_t)metaslab_unflushed_txg(msp)); } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *bias_str = ""; if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { bias_str = VDEV_ALLOC_BIAS_LOG; } else if (alloc_bias == VDEV_BIAS_SPECIAL) { bias_str = VDEV_ALLOC_BIAS_SPECIAL; } else if (alloc_bias == VDEV_BIAS_DEDUP) { bias_str = VDEV_ALLOC_BIAS_DEDUP; } uint64_t ms_flush_data_obj = 0; if (vd->vdev_top_zap != 0) { int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &ms_flush_data_obj); if (error != ENOENT) { ASSERT0(error); } } (void) printf("\tvdev %10llu %s", (u_longlong_t)vd->vdev_id, bias_str); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", (u_longlong_t)ms_flush_data_obj); } (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", "---------------", "------------"); } static void dump_metaslab_groups(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT3P(vib, ==, NULL); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", (longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = &vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, (longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (!dump_opt['d'] && zopt_metaslab_args > 0) { c = zopt_metaslab[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_metaslab_args > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_metaslab_args; m++) { if (zopt_metaslab[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_metaslab[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_metaslab[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_log_spacemaps(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; (void) printf("\nLog Space Maps in Pool:\n"); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); (void) printf("Log Spacemap object %llu txg %llu\n", (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); dump_spacemap(spa->spa_meta_objset, sm); space_map_close(sm); } (void) printf("\n"); } static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; const char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { char name[DDT_NAMELEN]; ddt_entry_t dde; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT(error == 0); error = ddt_object_count(ddt, type, class, &count); ASSERT(error == 0); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, (u_longlong_t)(dspace / count), (u_longlong_t)(mspace / count)); if (dump_opt['D'] < 3) return; zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } } } ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { range_tree_t *rt = vd->vdev_dtl[t]; if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf; uint64_t resid, len, off = 0; uint_t num = 0; int error; char tbuf[30]; if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", __func__); return; } do { len = SPA_OLD_MAXBLOCKSIZE; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); free(buf); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; } while (len != 0); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { boolean_t printed = B_FALSE; if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } else { tbuf[0] = '\0'; } if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) { (void) printf("%s %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) { uint64_t ievent; ievent = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) printf(" %s [internal %s txg:%ju] %s\n", tbuf, zfs_history_event_names[ievent], fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) { (void) printf("%s [txg:%ju] %s", tbuf, fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_NAME)); if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(events[i], ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64( events[i], ZPOOL_HIST_DSID)); } (void) printf(" %s\n", fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) { (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_IOCTL)); if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(events[i], ZPOOL_HIST_ERRNO)); } } else { goto next; } printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } free(buf); } /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, const blkptr_t *bp) { abd_t *pabd; void *buf; zio_t *zio; zfs_zstdhdr_t zstd_hdr; int error; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) return; if (BP_IS_HOLE(bp)) return; if (BP_IS_EMBEDDED(bp)) { buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); exit(1); } decode_embedded_bp_compressed(bp, buf); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); free(buf); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); return; } pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); zio = zio_root(spa, NULL, NULL, 0); /* Decrypt but don't decompress so we can read the compression header */ zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, NULL)); error = zio_wait(zio); if (error) { (void) fprintf(stderr, "read failed: %d\n", error); return; } buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:NORMAL", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; int i; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); if (bp_freed) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); } return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)bp->blk_birth); return; } blkbuf[0] = '\0'; for (i = 0; i < ndvas; i++) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)bp->blk_birth); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); } } static void print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; if (!BP_IS_EMBEDDED(bp)) { ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; if (bp->blk_birth == 0) return (0); print_indirect(spa, bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); arc_buf_destroy(buf, &buf); } return (err); } /*ARGSUSED*/ static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } /*ARGSUSED*/ static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } /*ARGSUSED*/ static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } /* ARGSUSED */ static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } /* ARGSUSED */ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; uint64_t i; /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu freed, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } else { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu blkptrs, " "%llu freed, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, bytes); } else { (void) printf(" %*s: object %llu, %llu blkptrs, " "%s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static int dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, boolean_t print_list) { int err = 0; zfs_bookmark_phys_t prop; objset_t *mos = dp->dp_spa->spa_meta_objset; err = dsl_bookmark_lookup(dp, name, NULL, &prop); if (err != 0) { return (err); } (void) printf("\t#%s: ", strchr(name, '#') + 1); (void) printf("{guid: %llx creation_txg: %llu creation_time: " "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, (u_longlong_t)prop.zbm_creation_txg, (u_longlong_t)prop.zbm_creation_time, (u_longlong_t)prop.zbm_redaction_obj); IMPLY(print_list, print_redact); if (!print_redact || prop.zbm_redaction_obj == 0) return (0); redaction_list_t *rl; VERIFY0(dsl_redaction_list_hold_obj(dp, prop.zbm_redaction_obj, FTAG, &rl)); redaction_list_phys_t *rlp = rl->rl_phys; (void) printf("\tRedacted:\n\t\tProgress: "); if (rlp->rlp_last_object != UINT64_MAX || rlp->rlp_last_blkid != UINT64_MAX) { (void) printf("%llu %llu (incomplete)\n", (u_longlong_t)rlp->rlp_last_object, (u_longlong_t)rlp->rlp_last_blkid); } else { (void) printf("complete\n"); } (void) printf("\t\tSnapshots: ["); for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { if (i > 0) (void) printf(", "); (void) printf("%0llu", (u_longlong_t)rlp->rlp_snaps[i]); } (void) printf("]\n\t\tLength: %llu\n", (u_longlong_t)rlp->rlp_num_entries); if (!print_list) { dsl_redaction_list_rele(rl, FTAG); return (0); } if (rlp->rlp_num_entries == 0) { dsl_redaction_list_rele(rl, FTAG); (void) printf("\t\tRedaction List: []\n\n"); return (0); } redact_block_phys_t *rbp_buf; uint64_t size; dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); size = doi.doi_max_offset; rbp_buf = kmem_alloc(size, KM_SLEEP); err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, rbp_buf, 0); if (err != 0) { dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); return (err); } (void) printf("\t\tRedaction List: [{object: %llx, offset: " "%llx, blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[0].rbp_object, (u_longlong_t)rbp_buf[0].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[0])), (u_longlong_t)redact_block_get_count(&rbp_buf[0])); for (size_t i = 1; i < rlp->rlp_num_entries; i++) { (void) printf(",\n\t\t{object: %llx, offset: %llx, " "blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[i].rbp_object, (u_longlong_t)rbp_buf[i].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[i])), (u_longlong_t)redact_block_get_count(&rbp_buf[i])); } dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); (void) printf("]\n\n"); return (0); } static void dump_bookmarks(objset_t *os, int verbosity) { zap_cursor_t zc; zap_attribute_t attr; dsl_dataset_t *ds = dmu_objset_ds(os); dsl_pool_t *dp = spa_get_dsl(os->os_spa); objset_t *mos = os->os_spa->spa_meta_objset; if (verbosity < 4) return; dsl_pool_config_enter(dp, FTAG); for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { char osname[ZFS_MAX_DATASET_NAME_LEN]; char buf[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(os, osname); VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname, attr.za_name)); (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); } zap_cursor_fini(&zc); dsl_pool_config_exit(dp, FTAG); } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static int dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) { spa_t *spa = arg; uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); return (0); } static int dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) { ASSERT(arg == NULL); if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } return (0); } static void dump_blkptr_list(dsl_deadlist_t *dl, char *name) { char bytes[32]; char comp[32]; char uncomp[32]; char entries[32]; spa_t *spa = dmu_objset_spa(dl->dl_os); uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); (void) printf("\n %s: %s (%s/%s comp), %s entries\n", name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) printf("\n"); dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); } static int verify_dd_livelist(objset_t *os) { uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; dsl_pool_t *dp = spa_get_dsl(os->os_spa); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; ASSERT(!dmu_objset_is_snapshot(os)); if (!dsl_deadlist_is_open(&dd->dd_livelist)) return (0); /* Iterate through the livelist to check for duplicates */ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, NULL); dsl_pool_config_enter(dp, FTAG); dsl_deadlist_space(&dd->dd_livelist, &ll_used, &ll_comp, &ll_uncomp); dsl_dataset_t *origin_ds; ASSERT(dsl_pool_config_held(dp)); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, &used, &comp, &uncomp)); dsl_dataset_rele(origin_ds, FTAG); dsl_pool_config_exit(dp, FTAG); /* * It's possible that the dataset's uncomp space is larger than the * livelist's because livelists do not track embedded block pointers */ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { char nice_used[32], nice_comp[32], nice_uncomp[32]; (void) printf("Discrepancy in space accounting:\n"); zdb_nicenum(used, nice_used, sizeof (nice_used)); zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("dir: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("livelist: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); return (1); } return (0); } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY3P(sa_os, ==, NULL); /* * We can't own an objset if it's redacted. Therefore, we do this * dance: hold the objset, then acquire a long hold on its dataset, then * release the pool (which is held as part of holding the objset). */ err = dmu_objset_hold(path, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele(dmu_objset_ds(*osp), tag); *osp = NULL; } } sa_os = *osp; return (0); } static void close_objset(objset_t *os, void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dsl_dataset_long_rele(dmu_objset_ds(os), tag); dsl_dataset_rele(dmu_objset_ds(os), tag); sa_attr_table = NULL; sa_os = NULL; } static void fuid_table_destroy(void) { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { char *domain; domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } static void dump_znode_sa_xattr(sa_handle_t *hdl) { nvlist_t *sa_xattr; nvpair_t *elem = NULL; int sa_xattr_size = 0; int sa_xattr_entries = 0; int error; char *sa_xattr_packed; error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); if (error || sa_xattr_size == 0) return; sa_xattr_packed = malloc(sa_xattr_size); if (sa_xattr_packed == NULL) return; error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], sa_xattr_packed, sa_xattr_size); if (error) { free(sa_xattr_packed); return; } error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); if (error) { free(sa_xattr_packed); return; } while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) sa_xattr_entries++; (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); for (idx = 0; idx < cnt; ++idx) { if (isprint(value[idx])) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); } (void) putchar('\n'); } nvlist_free(sa_xattr); free(sa_xattr_packed); } static void dump_znode_symlink(sa_handle_t *hdl) { int sa_symlink_size = 0; char linktarget[MAXPATHLEN]; linktarget[0] = '\0'; int error; error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); if (error || sa_symlink_size == 0) { return; } if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], &linktarget, sa_symlink_size) == 0) (void) printf("\ttarget %s\n", linktarget); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if (sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not found, possibly leaked"); } (void) printf("\tpath %s\n", path); } if (S_ISLNK(mode)) dump_znode_symlink(hdl); dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { uint64_t projid; if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, sizeof (uint64_t)) == 0) (void) printf("\tprojid %llu\n", (u_longlong_t)projid); } if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); dump_znode_sa_xattr(hdl); sa_handle_destroy(hdl); } /*ARGSUSED*/ static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { } /*ARGSUSED*/ static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group/project used */ dump_zap, /* ZFS user/group/project quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static boolean_t match_object_type(dmu_object_type_t obj_type, uint64_t flags) { boolean_t match = B_TRUE; switch (obj_type) { case DMU_OT_DIRECTORY_CONTENTS: if (!(flags & ZOR_FLAG_DIRECTORY)) match = B_FALSE; break; case DMU_OT_PLAIN_FILE_CONTENTS: if (!(flags & ZOR_FLAG_PLAIN_FILE)) match = B_FALSE; break; case DMU_OT_SPACE_MAP: if (!(flags & ZOR_FLAG_SPACE_MAP)) match = B_FALSE; break; default: if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { if (!(flags & ZOR_FLAG_ZAP)) match = B_FALSE; break; } /* * If all bits except some of the supported flags are * set, the user combined the all-types flag (A) with * a negated flag to exclude some types (e.g. A-f to * show all object types except plain files). */ if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) match = B_FALSE; break; } return (match); } static void dump_object(objset_t *os, uint64_t object, int verbosity, boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; boolean_t dnode_held = B_FALSE; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); dmu_object_info_from_dnode(dn, &doi); } else { /* * Encrypted datasets will have sensitive bonus buffers * encrypted. Therefore we cannot hold the bonus buffer and * must hold the dnode itself instead. */ error = dmu_object_info(os, object, &doi); if (error) fatal("dmu_object_info() failed, errno %u", error); if (os->os_encrypted && DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { error = dnode_hold(os, object, FTAG, &dn); if (error) fatal("dnode_hold() failed, errno %u", error); dnode_held = B_TRUE; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } } /* * Default to showing all object types if no flags were specified. */ if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && !match_object_type(doi.doi_type, flags)) goto out; if (dnode_slots_used) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress == ZIO_COMPRESS_INHERIT && ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { const char *compname = NULL; if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), &compname) == 0) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", compname); } else { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s-unknown)", ZDB_COMPRESS_NAME(os->os_compress)); } } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", zdb_ot_name(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? "USEROBJUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); if (!dnode_held) { object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); } else { (void) printf("\t\t(bonus encrypted)\n"); } if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) { object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); } else { (void) printf("\t\t(object encrypted)\n"); } *print_header = B_TRUE; } if (verbosity >= 5) dump_indirect(dn); if (verbosity >= 5) { /* * Report the list of segments that comprise the object. */ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } out: if (db != NULL) dmu_buf_rele(db, FTAG); if (dnode_held) dnode_rele(dn, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); /* * The dd_crypto_obj can be referenced by multiple dsl_dir's. * Ignore the references after the first one. */ mos_obj_refd_multiple(dd->dd_crypto_obj); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); mos_obj_refd(ds->ds_bookmarks_obj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; /* * Parse a string denoting a range of object IDs of the form * [:[:flags]], and store the results in zor. * Return 0 on success. On error, return 1 and update the msg * pointer to point to a descriptive error message. */ static int parse_object_range(char *range, zopt_object_range_t *zor, char **msg) { uint64_t flags = 0; char *p, *s, *dup, *flagstr, *tmp = NULL; size_t len; int i; int rc = 0; if (strchr(range, ':') == NULL) { zor->zor_obj_start = strtoull(range, &p, 0); if (*p != '\0') { *msg = "Invalid characters in object ID"; rc = 1; } zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); zor->zor_obj_end = zor->zor_obj_start; return (rc); } if (strchr(range, ':') == range) { *msg = "Invalid leading colon"; rc = 1; return (rc); } len = strlen(range); if (range[len - 1] == ':') { *msg = "Invalid trailing colon"; rc = 1; return (rc); } dup = strdup(range); s = strtok_r(dup, ":", &tmp); zor->zor_obj_start = strtoull(s, &p, 0); if (*p != '\0') { *msg = "Invalid characters in start object ID"; rc = 1; goto out; } s = strtok_r(NULL, ":", &tmp); zor->zor_obj_end = strtoull(s, &p, 0); if (*p != '\0') { *msg = "Invalid characters in end object ID"; rc = 1; goto out; } if (zor->zor_obj_start > zor->zor_obj_end) { *msg = "Start object ID may not exceed end object ID"; rc = 1; goto out; } s = strtok_r(NULL, ":", &tmp); if (s == NULL) { zor->zor_flags = ZOR_FLAG_ALL_TYPES; goto out; } else if (strtok_r(NULL, ":", &tmp) != NULL) { *msg = "Invalid colon-delimited field after flags"; rc = 1; goto out; } flagstr = s; for (i = 0; flagstr[i]; i++) { int bit; boolean_t negation = (flagstr[i] == '-'); if (negation) { i++; if (flagstr[i] == '\0') { *msg = "Invalid trailing negation operator"; rc = 1; goto out; } } bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { *msg = "Invalid flag"; rc = 1; goto out; } if (negation) flags &= ~bit; else flags |= bit; } zor->zor_flags = flags; zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); out: free(dup); return (rc); } static void dump_objset(objset_t *os) { dmu_objset_stats_t dds = { 0 }; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; uint64_t obj_start; uint64_t obj_end; uint64_t flags; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); print_header = B_TRUE; if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? " (inconsistent)" : ""); for (i = 0; i < zopt_object_args; i++) { obj_start = zopt_object_ranges[i].zor_obj_start; obj_end = zopt_object_ranges[i].zor_obj_end; flags = zopt_object_ranges[i].zor_flags; object = obj_start; if (object == 0 || obj_start == obj_end) dump_object(os, object, verbosity, &print_header, NULL, flags); else object--; while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && object <= obj_end) { dump_object(os, object, verbosity, &print_header, NULL, flags); } } if (zopt_object_args > 0) { (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); if (verify_dd_livelist(os) != 0) fatal("livelist is incorrect"); } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } if (dmu_objset_ds(os) != NULL) dump_bookmarks(os, verbosity); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header, NULL, 0); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, NULL, 0); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, NULL, 0); } if (DMU_PROJECTUSED_DNODE(os) != NULL && DMU_PROJECTUSED_DNODE(os)->dn_type != 0) dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, &print_header, NULL, 0); object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header, &dnode_slots, 0); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); (void) printf(" Dnode slots:\n"); (void) printf("\tTotal used: %10llu\n", (u_longlong_t)total_slots_used); (void) printf("\tMax used: %10llu\n", (u_longlong_t)max_slot_used); (void) printf("\tPercent empty: %10lf\n", (double)(max_slot_used - total_slots_used)*100 / (double)max_slot_used); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } ASSERT3U(object_count, ==, usedobjs); if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { (void) printf("\tmmp_delay = %0llu\n", (u_longlong_t)ub->ub_mmp_delay); if (MMP_SEQ_VALID(ub)) (void) printf("\tmmp_seq = %u\n", (unsigned int) MMP_SEQ(ub)); if (MMP_FAIL_INT_VALID(ub)) (void) printf("\tmmp_fail = %u\n", (unsigned int) MMP_FAIL_INT(ub)); if (MMP_INTERVAL_VALID(ub)) (void) printf("\tmmp_write = %u\n", (unsigned int) MMP_INTERVAL(ub)); /* After MMP_* to make summarize_uberblock_mmp cleaner */ (void) printf("\tmmp_valid = %x\n", (unsigned int) ub->ub_mmp_config & 0xFF); } if (dump_opt['u'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } /* * ZFS label nvlist stats */ typedef struct zdb_nvl_stats { int zns_list_count; int zns_leaf_count; size_t zns_leaf_largest; size_t zns_leaf_total; nvlist_t *zns_string; nvlist_t *zns_uint64; nvlist_t *zns_boolean; } zdb_nvl_stats_t; static void collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) { nvlist_t *list, **array; nvpair_t *nvp = NULL; char *name; uint_t i, items; stats->zns_list_count++; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { name = nvpair_name(nvp); switch (nvpair_type(nvp)) { case DATA_TYPE_STRING: fnvlist_add_string(stats->zns_string, name, fnvpair_value_string(nvp)); break; case DATA_TYPE_UINT64: fnvlist_add_uint64(stats->zns_uint64, name, fnvpair_value_uint64(nvp)); break; case DATA_TYPE_BOOLEAN: fnvlist_add_boolean(stats->zns_boolean, name); break; case DATA_TYPE_NVLIST: if (nvpair_value_nvlist(nvp, &list) == 0) collect_nvlist_stats(list, stats); break; case DATA_TYPE_NVLIST_ARRAY: if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) break; for (i = 0; i < items; i++) { collect_nvlist_stats(array[i], stats); /* collect stats on leaf vdev */ if (strcmp(name, "children") == 0) { size_t size; (void) nvlist_size(array[i], &size, NV_ENCODE_XDR); stats->zns_leaf_total += size; if (size > stats->zns_leaf_largest) stats->zns_leaf_largest = size; stats->zns_leaf_count++; } } break; default: (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); } } } static void dump_nvlist_stats(nvlist_t *nvl, size_t cap) { zdb_nvl_stats_t stats = { 0 }; size_t size, sum = 0, total; size_t noise; /* requires nvlist with non-unique names for stat collection */ VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); (void) printf("\n\nZFS Label NVList Config Stats:\n"); VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", (int)total, (int)(cap - total), 100.0 * total / cap); collect_nvlist_stats(nvl, &stats); VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", (int)fnvlist_num_pairs(stats.zns_uint64), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", (int)fnvlist_num_pairs(stats.zns_string), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", (int)fnvlist_num_pairs(stats.zns_boolean), (int)size, 100.0 * size / total); size = total - sum; /* treat remainder as nvlist overhead */ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", stats.zns_list_count, (int)size, 100.0 * size / total); if (stats.zns_leaf_count > 0) { size_t average = stats.zns_leaf_total / stats.zns_leaf_count; (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", stats.zns_leaf_count, (int)average); (void) printf("%24d bytes largest\n", (int)stats.zns_leaf_largest); if (dump_opt['l'] >= 3 && average > 0) (void) printf(" space for %d additional leaf vdevs\n", (int)((cap - total) / average)); } (void) printf("\n"); nvlist_free(stats.zns_string); nvlist_free(stats.zns_uint64); nvlist_free(stats.zns_boolean); } typedef struct cksum_record { zio_cksum_t cksum; boolean_t labels[VDEV_LABELS]; avl_node_t link; } cksum_record_t; static int cksum_record_compare(const void *x1, const void *x2) { const cksum_record_t *l = (cksum_record_t *)x1; const cksum_record_t *r = (cksum_record_t *)x2; int arraysize = ARRAY_SIZE(l->cksum.zc_word); int difference; for (int i = 0; i < arraysize; i++) { difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } return (difference); } static cksum_record_t * cksum_record_alloc(zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); rec->cksum = *cksum; rec->labels[l] = B_TRUE; return (rec); } static cksum_record_t * cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) { cksum_record_t lookup = { .cksum = *cksum }; avl_index_t where; return (avl_find(tree, &lookup, &where)); } static cksum_record_t * cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = cksum_record_lookup(tree, cksum); if (rec) { rec->labels[l] = B_TRUE; } else { rec = cksum_record_alloc(cksum, l); avl_add(tree, rec); } return (rec); } static int first_label(cksum_record_t *rec) { for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i]) return (i); return (-1); } static void print_label_numbers(char *prefix, cksum_record_t *rec) { printf("%s", prefix); for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i] == B_TRUE) printf("%d ", i); printf("\n"); } #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) typedef struct zdb_label { vdev_label_t label; nvlist_t *config_nv; cksum_record_t *config; cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; boolean_t header_printed; boolean_t read_failed; } zdb_label_t; static void print_label_header(zdb_label_t *label, int l) { if (dump_opt['q']) return; if (label->header_printed == B_TRUE) return; (void) printf("------------------------------------\n"); (void) printf("LABEL %d\n", l); (void) printf("------------------------------------\n"); label->header_printed = B_TRUE; } static void print_l2arc_header(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device header\n"); (void) printf("------------------------------------\n"); } static void print_l2arc_log_blocks(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device log blocks\n"); (void) printf("------------------------------------\n"); } static void dump_l2arc_log_entries(uint64_t log_entries, l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " "vdev: %llu, offset: %llu\n", (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); (void) printf("|\t\t\t\tbirth: %llu\n", (u_longlong_t)le[j].le_birth); (void) printf("|\t\t\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tpsize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tcompr: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); (void) printf("|\t\t\t\tcomplevel: %llu\n", (u_longlong_t)(&le[j])->le_complevel); (void) printf("|\t\t\t\ttype: %llu\n", (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); (void) printf("|\t\t\t\tprotected: %llu\n", (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); (void) printf("|\t\t\t\tprefetch: %llu\n", (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); (void) printf("|\t\t\t\taddress: %llu\n", (u_longlong_t)le[j].le_daddr); (void) printf("|\t\t\t\tARC state: %llu\n", (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); (void) printf("|\n"); } (void) printf("\n"); } static void dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) { (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr); (void) printf("|\t\tpayload_asize: %llu\n", (u_longlong_t)lbps.lbp_payload_asize); (void) printf("|\t\tpayload_start: %llu\n", (u_longlong_t)lbps.lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); (void) printf("|\t\tasize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); (void) printf("|\t\tcksumalgo: %llu\n", (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop)); (void) printf("|\n\n"); } static void dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; uint64_t asize; l2arc_log_blkptr_t lbps[2]; abd_t *abd; zio_cksum_t cksum; int failed = 0; l2arc_dev_t dev; if (!dump_opt['q']) print_l2arc_log_blocks(); bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr.dh_evict; dev.l2ad_start = l2dhdr.dh_start; dev.l2ad_end = l2dhdr.dh_end; if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ if (!dump_opt['q']) { (void) printf("No log blocks to read\n"); (void) printf("\n"); } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); } dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); for (;;) { if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { if (!dump_opt['q']) { (void) printf("Error while reading next log " "block\n\n"); } break; } fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; if (!dump_opt['q']) { (void) printf("Invalid cksum\n"); dump_l2arc_log_blkptr(lbps[0]); } break; } switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { case ZIO_COMPRESS_OFF: break; default: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, &this_lb, 0, asize); zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &this_lb, asize, sizeof (this_lb), NULL); abd_free(abd); break; } if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { if (!dump_opt['q']) (void) printf("Invalid log block magic\n\n"); break; } rebuild->dh_lb_count++; rebuild->dh_lb_asize += asize; if (dump_opt['l'] > 1 && !dump_opt['q']) { (void) printf("lb[%4llu]\tmagic: %llu\n", (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(lbps[0]); } if (dump_opt['l'] > 2 && !dump_opt['q']) dump_l2arc_log_entries(l2dhdr.dh_log_entries, this_lb.lb_entries, rebuild->dh_lb_count); if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev.l2ad_evict) && !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } if (!dump_opt['q']) { (void) printf("log_blk_count:\t %llu with valid cksum\n", (u_longlong_t)rebuild->dh_lb_count); (void) printf("\t\t %d with invalid cksum\n", failed); (void) printf("log_blk_asize:\t %llu\n\n", (u_longlong_t)rebuild->dh_lb_asize); } } static int dump_l2arc_header(int fd) { l2arc_dev_hdr_phys_t l2dhdr, rebuild; int error = B_FALSE; bzero(&l2dhdr, sizeof (l2dhdr)); bzero(&rebuild, sizeof (rebuild)); if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; } else { if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) error = B_TRUE; } if (error) { (void) printf("L2ARC device header not found\n\n"); /* Do not return an error here for backward compatibility */ return (0); } else if (!dump_opt['q']) { print_l2arc_header(); (void) printf(" magic: %llu\n", (u_longlong_t)l2dhdr.dh_magic); (void) printf(" version: %llu\n", (u_longlong_t)l2dhdr.dh_version); (void) printf(" pool_guid: %llu\n", (u_longlong_t)l2dhdr.dh_spa_guid); (void) printf(" flags: %llu\n", (u_longlong_t)l2dhdr.dh_flags); (void) printf(" start_lbps[0]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[0].lbp_daddr); (void) printf(" start_lbps[1]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); (void) printf(" lb_asize_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_asize); (void) printf(" lb_count_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_count); (void) printf(" trim_action_time: %llu\n", (u_longlong_t)l2dhdr.dh_trim_action_time); (void) printf(" trim_state: %llu\n\n", (u_longlong_t)l2dhdr.dh_trim_state); } dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); /* * The total aligned size of log blocks and the number of log blocks * reported in the header of the device may be less than what zdb * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). * This happens because dump_l2arc_log_blocks() lacks the memory * pressure valve that l2arc_rebuild() has. Thus, if we are on a system * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize * and dh_lb_count will be lower to begin with than what exists on the * device. This is normal and zdb should not exit with an error. The * opposite case should never happen though, the values reported in the * header should never be higher than what dump_l2arc_log_blocks() and * l2arc_rebuild() report. If this happens there is a leak in the * accounting of log blocks. */ if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || l2dhdr.dh_lb_count > rebuild.dh_lb_count) return (1); return (0); } static void dump_config_from_label(zdb_label_t *label, size_t buflen, int l) { if (dump_opt['q']) return; if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) return; print_label_header(label, l); dump_nvlist(label->config_nv, 4); print_label_numbers(" labels = ", label->config); if (dump_opt['l'] >= 2) dump_nvlist_stats(label->config_nv, buflen); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) { vdev_t vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)&label->label + uoff); cksum_record_t *rec = label->uberblocks[i]; if (rec == NULL) { if (dump_opt['u'] >= 2) { print_label_header(label, label_num); (void) printf(" Uberblock[%d] invalid\n", i); } continue; } if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) continue; if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) continue; print_label_header(label, label_num); (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, " Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); print_label_numbers(" labels = ", rec); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. */ static int dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) { int err; boolean_t header = B_TRUE; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1, retobj)); fallthrough; case DMU_OT_PLAIN_FILE_CONTENTS: if (retobj != NULL) { *retobj = child_obj; } else { dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0); } return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. */ static int dump_path(char *ds, char *path, uint64_t *retobj) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); close_objset(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path, retobj); close_objset(os, FTAG); return (err); } static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { int err = 0; uint64_t size, readsize, oursize, offset; ssize_t writesize; sa_handle_t *hdl; (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, destfile); VERIFY3P(os, ==, sa_os); if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { (void) printf("Failed to get handle for SA znode\n"); return (err); } if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { (void) sa_handle_destroy(hdl); return (err); } (void) sa_handle_destroy(hdl); (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, size); if (size == 0) { return (EINVAL); } int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); offset = 0; char *buf = kmem_alloc(oursize, KM_NOSLEEP); if (buf == NULL) { return (ENOMEM); } while (offset < size) { readsize = MIN(size - offset, 1 << 20); err = dmu_read(os, srcobj, offset, readsize, buf, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(buf, oursize); return (err); } if (dump_opt['v'] > 3) { (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 " error=%d\n", offset, readsize, err); } writesize = write(fd, buf, readsize); if (writesize < 0) { err = errno; break; } else if (writesize != readsize) { /* Incomplete write */ (void) fprintf(stderr, "Short write, only wrote %llu of" " %" PRIu64 " bytes, exiting...\n", (u_longlong_t)writesize, readsize); break; } offset += readsize; } (void) close(fd); if (buf != NULL) kmem_free(buf, oursize); return (err); } static int dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS]; uint64_t psize, ashift, l2cache; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; boolean_t read_l2arc_header = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; int fd; bzero(labels, sizeof (labels)); /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(path, dev, sizeof (path)); if (dev[0] != '/' && stat64(path, &statbuf) != 0) { int error; error = zfs_resolve_shortname(dev, path, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(path)) { if (zfs_append_partition(path, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat64(path, &statbuf) != 0)) { (void) printf("failed to find device %s, try " "specifying absolute path instead\n", dev); return (1); } } if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); exit(1); } if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) (void) printf("failed to invalidate cache '%s' : %s\n", path, strerror(errno)); avl_create(&config_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); avl_create(&uberblock_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); ashift = SPA_MINBLOCKSHIFT; /* * 1. Read the label from disk * 2. Unpack the configuration and insert in config tree. * 3. Traverse all uberblocks and insert in uberblock tree. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; char *buf = label->label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); nvlist_t *config; cksum_record_t *rec; zio_cksum_t cksum; vdev_t vd; if (pread64(fd, &label->label, sizeof (label->label), vdev_label_offset(psize, l, 0)) != sizeof (label->label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); label->read_failed = B_TRUE; error = B_TRUE; continue; } label->read_failed = B_FALSE; if (nvlist_unpack(buf, buflen, &config, 0) == 0) { nvlist_t *vdev_tree = NULL; size_t size; if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; /* If the device is a cache device clear the header. */ if (!read_l2arc_header) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && l2cache == POOL_STATE_L2CACHE) { read_l2arc_header = B_TRUE; } } fletcher_4_native_varsize(buf, size, &cksum); rec = cksum_record_insert(&config_tree, &cksum, l); label->config = rec; label->config_nv = config; config_found = B_TRUE; } else { error = B_TRUE; } vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)label + uoff); if (uberblock_verify(ub)) continue; fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); rec = cksum_record_insert(&uberblock_tree, &cksum, l); label->uberblocks[i] = rec; } } /* * Dump the label and uberblocks. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); if (label->read_failed == B_TRUE) continue; if (label->config_nv) { dump_config_from_label(label, buflen, l); } else { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } if (dump_opt['u']) dump_label_uberblocks(label, ashift, l); nvlist_free(label->config_nv); } /* * Dump the L2ARC header, if existent. */ if (read_l2arc_header) error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); cookie = NULL; while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); avl_destroy(&config_tree); avl_destroy(&uberblock_tree); (void) close(fd); return (config_found == B_FALSE ? 2 : (error == B_TRUE ? 1 : 0)); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int dump_one_objset(const char *dsname, void *arg) { int error; objset_t *os; spa_feature_t f; error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); for (f = 0; f < SPA_FEATURES; f++) { if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } for (dsl_bookmark_node_t *dbn = avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); if (dbn->dbn_phys.zbm_redaction_obj != 0) global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { global_feature_count[SPA_FEATURE_LIVELIST]++; } dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; uint64_t zcb_psize_total; uint64_t zcb_lsize_total; uint64_t zcb_asize_total; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; uint64_t zcb_start; hrtime_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ static boolean_t same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) { vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t ms_shift = vd->vdev_ms_shift; return ((off1 >> ms_shift) == (off2 >> ms_shift)); } /* * Used to simplify reporting of the histogram data. */ typedef struct one_histo { char *name; uint64_t *count; uint64_t *len; uint64_t cumulative; } one_histo_t; /* * The number of separate histograms processed for psize, lsize and asize. */ #define NUM_HISTO 3 /* * This routine will create a fixed column size output of three different * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M * the count, length and cumulative length of the psize, lsize and * asize blocks. * * All three types of blocks are listed on a single line * * By default the table is printed in nicenumber format (e.g. 123K) but * if the '-P' parameter is specified then the full raw number (parseable) * is printed out. */ static void dump_size_histograms(zdb_cb_t *zcb) { /* * A temporary buffer that allows us to convert a number into * a string using zdb_nicenumber to allow either raw or human * readable numbers to be output. */ char numbuf[32]; /* * Define titles which are used in the headers of the tables * printed by this routine. */ const char blocksize_title1[] = "block"; const char blocksize_title2[] = "size"; const char count_title[] = "Count"; const char length_title[] = "Size"; const char cumulative_title[] = "Cum."; /* * Setup the histogram arrays (psize, lsize, and asize). */ one_histo_t parm_histo[NUM_HISTO]; parm_histo[0].name = "psize"; parm_histo[0].count = zcb->zcb_psize_count; parm_histo[0].len = zcb->zcb_psize_len; parm_histo[0].cumulative = 0; parm_histo[1].name = "lsize"; parm_histo[1].count = zcb->zcb_lsize_count; parm_histo[1].len = zcb->zcb_lsize_len; parm_histo[1].cumulative = 0; parm_histo[2].name = "asize"; parm_histo[2].count = zcb->zcb_asize_count; parm_histo[2].len = zcb->zcb_asize_len; parm_histo[2].cumulative = 0; (void) printf("\nBlock Size Histogram\n"); /* * Print the first line titles */ if (dump_opt['P']) (void) printf("\n%s\t", blocksize_title1); else (void) printf("\n%7s ", blocksize_title1); for (int j = 0; j < NUM_HISTO; j++) { if (dump_opt['P']) { if (j < NUM_HISTO - 1) { (void) printf("%s\t\t\t", parm_histo[j].name); } else { /* Don't print trailing spaces */ (void) printf(" %s", parm_histo[j].name); } } else { if (j < NUM_HISTO - 1) { /* Left aligned strings in the output */ (void) printf("%-7s ", parm_histo[j].name); } else { /* Don't print trailing spaces */ (void) printf("%s", parm_histo[j].name); } } } (void) printf("\n"); /* * Print the second line titles */ if (dump_opt['P']) { (void) printf("%s\t", blocksize_title2); } else { (void) printf("%7s ", blocksize_title2); } for (int i = 0; i < NUM_HISTO; i++) { if (dump_opt['P']) { (void) printf("%s\t%s\t%s\t", count_title, length_title, cumulative_title); } else { (void) printf("%7s%7s%7s", count_title, length_title, cumulative_title); } } (void) printf("\n"); /* * Print the rows */ for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { /* * Print the first column showing the blocksize */ zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); if (dump_opt['P']) { printf("%s", numbuf); } else { printf("%7s:", numbuf); } /* * Print the remaining set of 3 columns per size: * for psize, lsize and asize */ for (int j = 0; j < NUM_HISTO; j++) { parm_histo[j].cumulative += parm_histo[j].len[i]; zdb_nicenum(parm_histo[j].count[i], numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); zdb_nicenum(parm_histo[j].len[i], numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); zdb_nicenum(parm_histo[j].cumulative, numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); } (void) printf("\n"); } } static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; if (same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) { zb->zb_ditto_samevdev++; if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; } break; } } spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } /* * The binning histogram bins by powers of two up to * SPA_MAXBLOCKSIZE rather than creating bins for * every possible blocksize found in the pool. */ int bin = highbit64(BP_GET_PSIZE(bp)) - 1; zcb->zcb_psize_count[bin]++; zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); zcb->zcb_psize_total += BP_GET_PSIZE(bp); bin = highbit64(BP_GET_LSIZE(bp)) - 1; zcb->zcb_lsize_count[bin]++; zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); zcb->zcb_lsize_total += BP_GET_LSIZE(bp); bin = highbit64(BP_GET_ASIZE(bp)) - 1; zcb->zcb_asize_count[bin]++; zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); if (dump_opt['L']) return; if (BP_GET_DEDUP(bp)) { ddt_t *ddt; ddt_entry_t *dde; ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { refcnt = 0; } else { ddt_phys_t *ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); refcnt = ddp->ddp_refcnt; if (ddt_phys_total_refcnt(dde) == 0) ddt_remove(ddt, dde); } ddt_exit(ddt); } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); abd_free(zio->io_abd); } static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes > max_inflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; - int kb_per_sec = + uint64_t kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); - int sec_remaining = + uint64_t sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, - "\r%5s completed (%4dMB/s) " - "estimated time remaining: %uhr %02umin %02usec ", + "\r%5s completed (%4"PRIu64"MB/s) " + "estimated time remaining: " + "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; /* ARGSUSED */ static int load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { spa_vdev_removal_t *svr = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; /* skip vdevs we don't care about */ if (sme->sme_vdev != svr->svr_vdev_id) return (0); vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) range_tree_add(svr->svr_allocd_segs, offset, size); else range_tree_remove(svr->svr_allocd_segs, offset, size); return (0); } /* ARGSUSED */ static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. */ static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return; if (spa->spa_vdev_removal == NULL) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT0(range_tree_space(svr->svr_allocd_segs)); range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT0(range_tree_space(allocs)); if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); } range_tree_destroy(allocs); iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); /* * Clear everything past what has been synced, * because we have not allocated mappings for * it yet. */ range_tree_clear(svr->svr_allocd_segs, vdev_indirect_mapping_max_offset(vim), vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; ASSERT(!bp_freed); ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); vdev_indirect_mapping_increment_obsolete_count( vd->vdev_indirect_mapping, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), zcb->zcb_vd_obsolete_counts[vd->vdev_id]); return (0); } static uint32_t * zdb_load_obsolete_counts(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint64_t obsolete_sm_object; uint32_t *counts; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); counts = vdev_indirect_mapping_load_obsolete_counts(vim); if (vd->vdev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, vd->vdev_obsolete_sm); } if (scip->scip_vdev == vd->vdev_id && scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); } return (counts); } static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { ddt_bookmark_t ddb; ddt_entry_t dde; int error; int p; ASSERT(!dump_opt['L']); bzero(&ddb, sizeof (ddb)); while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; if (ddb.ddb_class == DDT_CLASS_UNIQUE) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); zcb->zcb_dedup_blocks++; } } ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); } ASSERT(error == ENOENT); } typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; } checkpoint_sm_exclude_entry_arg_t; static int checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level * and the ms_sm space maps exist in the metaslab level, * an entry in the checkpoint space map could theoretically * cross the boundaries of the metaslab that it belongs. * * In reality, because of the way that we populate and * manipulate the checkpoint's space maps currently, * there shouldn't be any entries that cross metaslabs. * Hence the assertion below. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * By removing the entry from the allocated segments we * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; return (0); } static void zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) { spa_t *spa = vd->vdev_spa; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; /* * If there is no vdev_top_zap, we are in a pool whose * version predates the pool checkpoint feature. */ if (vd->vdev_top_zap == 0) return; /* * If there is no reference of the vdev_checkpoint_sm in * the vdev_top_zap, then one of the following scenarios * is true: * * 1] There is no checkpoint * 2] There is a checkpoint, but no checkpointed blocks * have been freed yet * 3] The current vdev is indirect * * In these cases we return immediately. */ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) return; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); checkpoint_sm_exclude_entry_arg_t cseea; cseea.cseea_vd = vd; cseea.cseea_checkpoint_size = 0; VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; } static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); } } static int count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { int64_t *ualloc_space = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) *ualloc_space += sme->sme_run; else *ualloc_space -= sme->sme_run; return (0); } static int64_t get_unflushed_alloc_space(spa_t *spa) { if (dump_opt['L']) return (0); int64_t ualloc_space = 0; iterate_through_spacemap_logs(spa, count_unflushed_space_cb, &ualloc_space); return (ualloc_space); } static int load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { maptype_t *uic_maptype = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (*uic_maptype == sme->sme_type) range_tree_add(ms->ms_allocatable, offset, size); else range_tree_remove(ms->ms_allocatable, offset, size); return (0); } static void load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) { iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } load_unflushed_to_ms_allocatables(spa, maptype); } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. */ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). */ vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (dump_opt['L']) return; dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); /* * For leak detection, we overload the ms_allocatable trees * to contain allocated segments instead of free segments. * As a result, we can't use the normal metaslab_load/unload * interfaces. */ zdb_leak_init_prepare_indirect_vdevs(spa, zcb); load_concrete_ms_allocatable_trees(spa, SM_ALLOC); /* * On load_concrete_ms_allocatable_trees() we loaded all the * allocated entries from the ms_sm to the ms_allocatable for * each metaslab. If the pool has a checkpoint or is in the * middle of discarding a checkpoint, some of these blocks * may have been freed but their ms_sm may not have been * updated because they are referenced by the checkpoint. In * order to avoid false-positives during leak-detection, we * go through the vdev's checkpoint space map and exclude all * its entries from their relevant ms_allocatable. * * We also aggregate the space held by the checkpoint and add * it to zcb_checkpoint_size. * * Note that at this point we are also verifying that all the * entries on the checkpoint_sm are marked as allocated in * the ms_sm of their relevant metaslab. * [see comment in checkpoint_sm_exclude_entry_cb()] */ zdb_leak_init_exclude_checkpoint(spa, zcb); ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; boolean_t are_precise = B_FALSE; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. */ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1 << vd->vdev_ashift) { if (range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1 << vd->vdev_ashift)) { obsolete_bytes += 1 << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (!are_precise && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return (B_FALSE); boolean_t leaks = B_FALSE; vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == spa_embedded_log_class(spa)) ? vd->vdev_log_mg : vd->vdev_mg); /* * ms_allocatable has been overloaded * to contain allocated segments. Now that * we finished traversing all blocks, any * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_allocatable. For indirect * vdevs, space remaining in the tree * represents parts of the mapping that are * not referenced, which is not a bug. */ if (vd->vdev_ops == &vdev_indirect_ops) { range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } } } umem_free(zcb->zcb_vd_obsolete_counts, rvd->vdev_children * sizeof (uint32_t *)); zcb->zcb_vd_obsolete_counts = NULL; return (leaks); } /* ARGSUSED */ static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } /* * Iterate over livelists which have been destroyed by the user but * are still present in the MOS, waiting to be freed */ static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) { objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; ASSERT0(err); zap_cursor_t zc; zap_attribute_t attr; dsl_deadlist_t ll; /* NULL out os prior to dsl_deadlist_open in case it's garbage */ ll.dl_os = NULL; for (zap_cursor_init(&zc, mos, zap_obj); zap_cursor_retrieve(&zc, &attr) == 0; (void) zap_cursor_advance(&zc)) { dsl_deadlist_open(&ll, mos, attr.za_first_integer); func(&ll, arg); dsl_deadlist_close(&ll); } zap_cursor_fini(&zc); } static int bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (count_block_cb(arg, bp, tx)); } static int livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) { zdb_cb_t *zbc = args; bplist_t blks; bplist_create(&blks); /* determine which blocks have been alloc'd but not freed */ VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); /* count those blocks */ (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); bplist_destroy(&blks); return (0); } static void livelist_count_blocks(dsl_deadlist_t *ll, void *arg) { dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); } /* * Count the blocks in the livelists that have been destroyed by the user * but haven't yet been freed. */ static void deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) { iterate_deleted_livelists(spa, livelist_count_blocks, zbc); } static void dump_livelist_cb(dsl_deadlist_t *ll, void *arg) { ASSERT3P(arg, ==, NULL); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); } /* * Print out, register object references to, and increment feature counts for * livelists that have been destroyed by the user but haven't yet been freed. */ static void deleted_livelists_dump_mos(spa_t *spa) { uint64_t zap_obj; objset_t *mos = spa->spa_meta_objset; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; mos_obj_refd(zap_obj); iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; boolean_t leaks = B_FALSE; int e, c, err; bp_embedded_type_t i; bzero(&zcb, sizeof (zcb)); (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * When leak detection is enabled we load all space maps as SM_ALLOC * maps, then traverse the pool claiming each block we discover. If * the pool is perfectly consistent, the segment trees will be empty * when we're done. Anything left over is a leak; any block we can't * claim (because it's not part of any space map) is a double * allocation, reference to a freed block, or an unclaimed log block. * * When leak detection is disabled (-L option) we still traverse the * pool claiming each block we discover, but we skip opening any space * maps. */ bzero(&zcb, sizeof (zdb_cb_t)); zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, bpobj_count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, bpobj_count_block_cb, &zcb, NULL); } zdb_claim_removing(spa, &zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, &zcb, NULL)); } deleted_livelists_count_blocks(spa, &zcb); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb.zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. */ if (dump_opt['c']) { for (c = 0; c < max_ncpus; c++) { (void) zio_wait(spa->spa_async_zio_root[c]); spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } ASSERT0(spa->spa_load_verify_bytes); /* * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ zcb.zcb_haderrors |= err; if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, &zcb); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); leaks = B_TRUE; } if (tzb->zb_count == 0) return (2); (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( spa_dedup_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Dedup class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_embedded_log_class(spa)); uint64_t space = metaslab_class_get_space( spa_embedded_log_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Embedded log class", (u_longlong_t)alloc, 100.0 * alloc / space); } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb.zcb_embedded_histogram[i], sizeof (zcb.zcb_embedded_histogram[i]) / sizeof (zcb.zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (tzb->zb_ditto_same_ms != 0) { (void) printf("\tDittoed blocks in same metaslab: %llu\n", (longlong_t)tzb->zb_ditto_same_ms); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; const char *typename; /* make sure nicenum has enough space */ CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? ZB_TOTAL : l); zb = &zcb.zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } /* Output a table summarizing block sizes in the pool */ if (dump_opt['b'] >= 2) { dump_size_histograms(&zcb); } } (void) printf("\n"); if (leaks) return (2); if (zcb.zcb_haderrors) return (3); return (0); } typedef struct zdb_ddt_entry { ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total; ddt_stat_t dds_total; bzero(&ddh_total, sizeof (ddh_total)); bzero(&dds_total, sizeof (dds_total)); avl_create(&t, ddt_entry_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; dds.dds_psize = zdde->zdde_ref_psize / refcnt; dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; dds.dds_ref_blocks = zdde->zdde_ref_blocks; dds.dds_ref_lsize = zdde->zdde_ref_lsize; dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], &dds, 0); umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_stat(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } static void zdb_set_skip_mmp(char *target) { spa_t *spa; /* * Disable the activity check to allow examination of * active pools. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL) { spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; } mutex_exit(&spa_namespace_lock); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appended to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. */ static char * import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; boolean_t freecfg = B_FALSE; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (path_start != NULL) { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } else { poolname = target; } if (cfg == NULL) { zdb_set_skip_mmp(poolname); error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } freecfg = B_TRUE; } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) return (NULL); fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); if (freecfg) nvlist_free(cfg); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && path_start != NULL) { if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { if (path_start != NULL) free(poolname); return (NULL); } } if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); range_tree_verify_not_present(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. */ static void verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; vdev_t *current_vd = current_rvd->vdev_child[c]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * Since we don't allow device removal in a pool * that has a checkpoint, we expect that all removed * vdevs were removed from the pool before the * checkpoint. */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } /* * If the checkpoint space map doesn't exist, then nothing * here is checkpointed so there's nothing to verify. */ if (current_vd->vdev_top_zap == 0 || zap_contains(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; vcsec.vcsec_entryid = 0; vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); if (dump_opt['m'] > 3) dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } /* * If we've added vdevs since we took the checkpoint, ensure * that their checkpoint space maps are empty. */ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } /* * Verifies that all space that's allocated in the checkpoint is * still allocated in the current version, by checking that everything * in checkpoint's ms_allocatable (which is actually allocated, not * allocatable/free) is not present in current's ms_allocatable. * * Note that the function changes the state of the ms_allocatable * trees of both spas when called. The entries of all ms_allocatable * trees are cleared out and then repopulated from their respective * ms_sm space maps. In the checkpointed state we load the allocated * entries, and in the current state we load the free entries. */ static void verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); load_concrete_ms_allocatable_trees(current, SM_FREE); for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; vdev_t *current_vd = current_rvd->vdev_child[i]; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * See comment in verify_checkpoint_vdev_spacemaps() */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; metaslab_t *current_msp = current_vd->vdev_ms[m]; (void) fprintf(stderr, "\rverifying vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)current_vd->vdev_id, (longlong_t)current_rvd->vdev_children, (longlong_t)current_vd->vdev_ms[m]->ms_id, (longlong_t)current_vd->vdev_ms_count); /* * We walk through the ms_allocatable trees that * are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, (range_tree_func_t *)range_tree_verify_not_present, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { ASSERT(!dump_opt['L']); spa_t *checkpoint_spa; char *checkpoint_pool; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. */ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. */ (void) printf("\nPartially discarded checkpoint " "state found:\n"); if (dump_opt['m'] > 3) dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } /* ARGSUSED */ static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) range_tree_add(mos_refd_objs, obj, 1); } /* * Call on a MOS object that may already have been referenced. */ static void mos_obj_refd_multiple(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL && !range_tree_contains(mos_refd_objs, obj, 1)) range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev_top_zap(vdev_t *vd) { uint64_t ms_flush_data_obj; int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(ms_flush_data_obj); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } if (vd->vdev_top_zap != 0) { mos_obj_refd(vd->vdev_top_zap); mos_leak_vdev_top_zap(vd); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static void mos_leak_log_spacemaps(spa_t *spa) { uint64_t spacemap_zap; int error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(spacemap_zap); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) mos_obj_refd(sls->sls_sm_obj); } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); mos_obj_refd(spa->spa_all_vdev_zaps); mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); if (spa->spa_syncing_log_sm != NULL) mos_obj_refd(spa->spa_syncing_log_sm->sm_object); mos_leak_log_spacemaps(spa); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t class = 0; class < DDT_CLASSES; class++) { for (uint64_t type = 0; type < DDT_TYPES; type++) { for (uint64_t cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { ddt_t *ddt = spa->spa_ddt[cksum]; mos_obj_refd(ddt->ddt_object[type][class]); } } } /* * Visit all allocated objects and make sure they are referenced. */ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (range_tree_contains(mos_refd_objs, object, 1)) { range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; dmu_object_info(mos, object, &doi); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!range_tree_is_empty(mos_refd_objs)) rv = 2; range_tree_vacate(mos_refd_objs, NULL, NULL); range_tree_destroy(mos_refd_objs); return (rv); } typedef struct log_sm_obsolete_stats_arg { uint64_t lsos_current_txg; uint64_t lsos_total_entries; uint64_t lsos_valid_entries; uint64_t lsos_sm_entries; uint64_t lsos_valid_sm_entries; } log_sm_obsolete_stats_arg_t; static int log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { log_sm_obsolete_stats_arg_t *lsos = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; if (lsos->lsos_current_txg == 0) { /* this is the first log */ lsos->lsos_current_txg = txg; } else if (lsos->lsos_current_txg < txg) { /* we just changed log - print stats and reset */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos->lsos_valid_sm_entries, (u_longlong_t)lsos->lsos_sm_entries, (u_longlong_t)lsos->lsos_current_txg); lsos->lsos_valid_sm_entries = 0; lsos->lsos_sm_entries = 0; lsos->lsos_current_txg = txg; } ASSERT3U(lsos->lsos_current_txg, ==, txg); lsos->lsos_sm_entries++; lsos->lsos_total_entries++; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); lsos->lsos_valid_sm_entries++; lsos->lsos_valid_entries++; return (0); } static void dump_log_spacemap_obsolete_stats(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; log_sm_obsolete_stats_arg_t lsos; bzero(&lsos, sizeof (lsos)); (void) printf("Log Space Map Obsolete Entry Statistics:\n"); iterate_through_spacemap_logs(spa, log_spacemap_obsolete_stats_cb, &lsos); /* print stats for latest log */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos.lsos_valid_sm_entries, (u_longlong_t)lsos.lsos_sm_entries, (u_longlong_t)lsos.lsos_current_txg); (void) printf("%-8llu valid entries out of %-8llu - total\n\n", (u_longlong_t)lsos.lsos_valid_entries, (u_longlong_t)lsos.lsos_total_entries); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['y']) { livelist_metaslab_validate(spa); } if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); if (dump_opt['d'] > 2 || dump_opt['m']) { dump_log_spacemaps(spa); dump_log_spacemap_obsolete_stats(spa); } if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; uint64_t *arr; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { if (global_feature_count[f] == UINT64_MAX) continue; if (!spa_feature_is_enabled(spa, f)) { ASSERT0(global_feature_count[f]); continue; } arr = global_feature_count; } else { if (!spa_feature_is_enabled(spa, f)) { ASSERT0(dataset_feature_count[f]); continue; } arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) rc = verify_device_removal_feature_counts(spa); } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_RAW 0x0020 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 #define ZDB_FLAG_VERBOSE 0x0080 static int flagbits[256]; static char flagbitstr[16]; static void zdb_print_blkptr(const blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, int flags) { zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); VERIFY(write(fileno(stdout), buf, size) == size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords = size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); #ifdef _LITTLE_ENDIAN /* correct the endianness */ do_bswap = !do_bswap; #endif for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the hierarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, const char *path) { char *s, *p, *q; unsigned i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (s && *s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } static int name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) { dsl_dataset_t *ds; dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, NULL, &ds); if (error != 0) { (void) fprintf(stderr, "failed to hold objset %llu: %s\n", (u_longlong_t)objset_id, strerror(error)); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); return (error); } dsl_dataset_name(ds, outstr); dsl_dataset_rele(ds, NULL); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); return (0); } static boolean_t zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) { char *s0, *s1, *tmp = NULL; if (sizes == NULL) return (B_FALSE); s0 = strtok_r(sizes, "/", &tmp); if (s0 == NULL) return (B_FALSE); s1 = strtok_r(NULL, "/", &tmp); *lsize = strtoull(s0, NULL, 16); *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; return (*lsize >= *psize && *psize > 0); } #define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) static boolean_t zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, uint64_t psize, int flags) { boolean_t exceeded = B_FALSE; /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; int *cfuncp = cfuncs; uint64_t maxlsize = SPA_MAXBLOCKSIZE; uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0); *cfuncp++ = ZIO_COMPRESS_LZ4; *cfuncp++ = ZIO_COMPRESS_LZJB; mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) if (((1ULL << c) & mask) == 0) *cfuncp++ = c; /* * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this * could take a while and we should let the user know * we are not stuck. On the other hand, printing progress * info gets old after a while. User can specify 'v' flag * to see the progression. */ if (lsize == psize) lsize += SPA_MINBLOCKSIZE; else maxlsize = lsize; for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { for (cfuncp = cfuncs; *cfuncp; cfuncp++) { if (flags & ZDB_FLAG_VERBOSE) { (void) fprintf(stderr, "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[*cfuncp].\ ci_name); } /* * We randomize lbuf2, and decompress to both * lbuf and lbuf2. This way, we will know if * decompression fill exactly to lsize. */ VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); if (zio_decompress_data(*cfuncp, pabd, lbuf, psize, lsize, NULL) == 0 && zio_decompress_data(*cfuncp, pabd, lbuf2, psize, lsize, NULL) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } if (*cfuncp != 0) break; } umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (lsize > maxlsize) { exceeded = B_TRUE; } if (*cfuncp == ZIO_COMPRESS_ZLE) { printf("\nZLE decompression was selected. If you " "suspect the results are wrong,\ntry avoiding ZLE " "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); } return (exceeded); } /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:[lsize/]psize[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * r: Dump raw data to stdout * v: Verbose * */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr, *sizes, *tmp = NULL; int i, error; boolean_t borrowed = B_FALSE, found = B_FALSE; dup = strdup(thing); s = strtok_r(dup, ":", &tmp); vdev = s ? s : ""; s = strtok_r(NULL, ":", &tmp); offset = strtoull(s ? s : "", NULL, 16); sizes = strtok_r(NULL, ":", &tmp); s = strtok_r(NULL, ":", &tmp); flagstr = strdup(s ? s : ""); s = NULL; tmp = NULL; if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) s = "invalid size(s)"; if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) s = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) s = "offset must be a multiple of sector size"; if (s) { (void) printf("Invalid block specifier: %s - %s\n", thing, s); goto done; } for (s = strtok_r(flagstr, ":", &tmp); s != NULL; s = strtok_r(NULL, ":", &tmp)) { for (i = 0; i < strlen(flagstr); i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Ignoring flag: %c\n", (uchar_t)flagstr[i]); continue; } found = B_TRUE; flags |= bit; p = &flagstr[i + 1]; if (*p != ':' && *p != '\0') { int j = 0, nextbit = flagbits[(uchar_t)*p]; char *end, offstr[8] = { 0 }; if ((bit == ZDB_FLAG_PRINT_BLKPTR) && (nextbit == 0)) { /* look ahead to isolate the offset */ while (nextbit == 0 && strchr(flagbitstr, *p) == NULL) { offstr[j] = *p; j++; if (i + j > strlen(flagstr)) break; p++; nextbit = flagbits[(uchar_t)*p]; } blkptr_offset = strtoull(offstr, &end, 16); i += j; } else if (nextbit == 0) { (void) printf("***Ignoring flag arg:" " '%c'\n", (uchar_t)*p); } } } } if (blkptr_offset % sizeof (blkptr_t)) { printf("Block pointer offset 0x%llx " "must be divisible by 0x%x\n", (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); goto done; } if (found == B_FALSE && strlen(flagstr) > 0) { printf("Invalid flag arg: '%s'\n", flagstr); goto done; } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); free(dup); return; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } uint64_t orig_lsize = lsize; buf = lbuf; if (flags & ZDB_FLAG_DECOMPRESS) { boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); if (failed) { (void) printf("Decompress of %s failed\n", thing); goto out; } } else { buf = abd_borrow_buf_copy(pabd, lsize); borrowed = B_TRUE; } /* * Try to detect invalid block pointer. If invalid, try * decompressing. */ if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == B_FALSE) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (failed || zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; } } } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, lsize, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, flags); else zdb_dump_block(thing, buf, lsize, flags); /* * If :c was specified, iterate through the checksum table to * calculate and display each checksum for our specified * DVA and length. */ if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && !(flags & ZDB_FLAG_GBH)) { zio_t *czio; (void) printf("\n"); for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { if ((zio_checksum_table[ck].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) || ck == ZIO_CHECKSUM_NOPARITY) { continue; } BP_SET_CHECKSUM(bp, ck); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); czio->io_bp = bp; if (vd == vd->vdev_top) { zio_nowait(zio_read(czio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_DONT_RETRY, NULL)); } else { zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(czio); if (error == 0 || error == ECKSUM) { zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; zio_checksum_compute(ck_zio, ck, pabd, lsize); printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", zio_checksum_table[ck].ci_name, (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); zio_wait(ck_zio); } else { printf("error %d reading block\n", error); } spa_config_exit(spa, SCL_STATE, FTAG); } } if (borrowed) abd_return_buf_copy(pabd, buf, lsize); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); done: free(flagstr); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp; unsigned long long *words = (void *)&bp; char *buf; int err; bzero(&bp, sizeof (bp)); err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } int main(int argc, char **argv) { int c; struct rlimit rl = { 1024, 1024 }; spa_t *spa = NULL; objset_t *os = NULL; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int64_t objset_id = -1; uint64_t object; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg = NULL; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; /* * For performance reasons, we set this tunable down. We do so before * the arg parsing section so that the user can override this value if * they choose. */ zfs_btree_verify_intensity = 3; while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqrRsSt:uU:vVx:XYyZ")) != -1) { switch (c) { case 'b': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'O': case 'r': case 'R': case 's': case 'S': case 'u': case 'y': case 'Z': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; case 'Y': zfs_reconstruct_indirect_combinations_max = INT_MAX; zfs_deadman_enabled = 0; break; /* NB: Sort single match options below. */ case 'I': max_inflight_bytes = strtoull(optarg, NULL, 0); if (max_inflight_bytes == 0) { (void) fprintf(stderr, "maximum number " "of inflight bytes must be greater " "than 0\n"); usage(); } break; case 'o': error = set_global_var(optarg); if (error != 0) usage(); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } if (dump_opt['d'] || dump_opt['r']) { /* [/ is accepted */ if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL && objset_str++ != NULL) { char *endptr; errno = 0; objset_id = strtoull(objset_str, &endptr, 0); /* dataset 0 is the same as opening the pool */ if (errno == 0 && endptr != objset_str && objset_id != 0) { target_is_spa = B_FALSE; dataset_lookup = B_TRUE; } else if (objset_id != 0) { printf("failed to open objset %s " "%llu %s", objset_str, (u_longlong_t)objset_id, strerror(errno)); exit(1); } /* normal dataset name not an objset ID */ if (endptr == objset_str) { objset_id = -1; } } } #if defined(_LP64) /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; #endif /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; kernel_init(SPA_MODE_READ); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("AeEFklLOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } libspl_assert_ok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); return (0); } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } usage(); } if (dump_opt['l']) return (dump_label(argv[0])); if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; return (dump_path(argv[0], argv[1], NULL)); } if (dump_opt['r']) { if (argc != 3) usage(); dump_opt['v'] = verbose; error = dump_path(argv[0], argv[1], &object); } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; target = argv[0]; if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_pool = strdup(target); *strpbrk(target_pool, "/@") = '\0'; target_is_spa = B_FALSE; targetlen = strlen(target); if (targetlen && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; } else { target_pool = target; } if (dump_opt['e']) { importargs_t args = { 0 }; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; error = zpool_find_config(NULL, target_pool, &cfg, &args, &libzpool_config_ops); if (error == 0) { if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } /* * Disable the activity check to allow examination of * active pools. */ error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } } if (searchdirs != NULL) { umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = NULL; } /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa * namespace. Because of that we need to make sure to call * it always after the -e option has been processed, which * imports the pool to the namespace if it's not in the * cachefile. */ char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (cfg != NULL) { nvlist_free(cfg); cfg = NULL; } if (target_pool != target) free(target_pool); if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT(checkpoint_target == NULL); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else if (strpbrk(target, "#") != NULL) { dsl_pool_t *dp; error = dsl_pool_hold(target, FTAG, &dp); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } error = dump_bookmark(dp, target, B_TRUE, verbose > 1); dsl_pool_rele(dp, FTAG); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } return (error); } else { zdb_set_skip_mmp(target); if (dataset_lookup == B_TRUE) { /* * Use the supplied id to get the name * for open_objset. */ error = spa_open(target, &spa, FTAG); if (error == 0) { error = name_from_objset_id(spa, objset_id, dsname); spa_close(spa, FTAG); if (error == 0) target = dsname; } } if (error == 0) error = open_objset(target, FTAG, &os); if (error == 0) spa = dmu_objset_spa(os); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); /* * Set the pool failure mode to panic in order to prevent the pool * from suspending. A suspended I/O will have no way to resume and * can prevent the zdb(8) command from terminating as expected. */ if (spa != NULL) spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; argv++; argc--; if (dump_opt['r']) { error = zdb_copy_object(os, object, argv[1]); } else if (!dump_opt['R']) { flagbits['d'] = ZOR_FLAG_DIRECTORY; flagbits['f'] = ZOR_FLAG_PLAIN_FILE; flagbits['m'] = ZOR_FLAG_SPACE_MAP; flagbits['z'] = ZOR_FLAG_ZAP; flagbits['A'] = ZOR_FLAG_ALL_TYPES; if (argc > 0 && dump_opt['d']) { zopt_object_args = argc; zopt_object_ranges = calloc(zopt_object_args, sizeof (zopt_object_range_t)); for (unsigned i = 0; i < zopt_object_args; i++) { int err; char *msg = NULL; err = parse_object_range(argv[i], &zopt_object_ranges[i], &msg); if (err != 0) fatal("Bad object or range: '%s': %s\n", argv[i], msg ? msg : ""); } } else if (argc > 0 && dump_opt['m']) { zopt_metaslab_args = argc; zopt_metaslab = calloc(zopt_metaslab_args, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_metaslab_args; i++) { errno = 0; zopt_metaslab[i] = strtoull(argv[i], NULL, 0); if (zopt_metaslab[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['r'] = ZDB_FLAG_RAW; flagbits['v'] = ZDB_FLAG_VERBOSE; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } if (os != NULL) { close_objset(os, FTAG); } else { spa_close(spa, FTAG); } fuid_table_destroy(); dump_debug_buffer(); kernel_fini(); return (error); } diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh index 76f09061c5b5..ab11dfbc99d5 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh @@ -1,74 +1,75 @@ #!/bin/sh # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License Version 1.0 (CDDL-1.0). # You can obtain a copy of the license from the top-level file # "OPENSOLARIS.LICENSE" or at . # You may not use this file except in compliance with the license. # # CDDL HEADER END # # # Send notification in response to a fault induced statechange # # ZEVENT_SUBCLASS: 'statechange' -# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED' +# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED', 'REMOVED', or 'UNAVAIL' # # Exit codes: # 0: notification sent # 1: notification failed # 2: notification not configured # 3: statechange not relevant # 4: statechange string missing (unexpected) [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" [ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4 if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \ && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \ - && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then + && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "UNAVAIL" ]; then exit 3 fi umask 077 note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)" note_pathname="$(mktemp)" { if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then echo "The number of I/O errors associated with a ZFS device exceeded" echo "acceptable levels. ZFS has marked the device as faulted." elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then echo "The number of checksum errors associated with a ZFS device" echo "exceeded acceptable levels. ZFS has marked the device as" echo "degraded." else echo "ZFS has detected that a device was removed." fi echo echo " impact: Fault tolerance of the pool may be compromised." echo " eid: ${ZEVENT_EID}" echo " class: ${ZEVENT_SUBCLASS}" echo " state: ${ZEVENT_VDEV_STATE_STR}" echo " host: $(hostname)" echo " time: ${ZEVENT_TIME_STRING}" [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}" [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}" echo " pool: ${ZEVENT_POOL_GUID}" } > "${note_pathname}" zed_notify "${note_subject}" "${note_pathname}"; rv=$? rm -f "${note_pathname}" exit "${rv}" diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am index aad45d4f7497..fa494c030e1c 100644 --- a/sys/contrib/openzfs/cmd/zpool/Makefile.am +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -1,188 +1,189 @@ include $(top_srcdir)/config/Rules.am include $(top_srcdir)/config/Shellcheck.am AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS) DEFAULT_INCLUDES += -I$(srcdir) sbin_PROGRAMS = zpool zpool_SOURCES = \ zpool_iter.c \ zpool_main.c \ zpool_util.c \ zpool_util.h \ zpool_vdev.c if BUILD_FREEBSD zpool_SOURCES += os/freebsd/zpool_vdev_os.c endif if BUILD_LINUX zpool_SOURCES += os/linux/zpool_vdev_os.c endif zpool_LDADD = \ $(abs_top_builddir)/lib/libzfs/libzfs.la \ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ - $(abs_top_builddir)/lib/libuutil/libuutil.la + $(abs_top_builddir)/lib/libuutil/libuutil.la \ + $(abs_top_builddir)/lib/libzutil/libzutil.la zpool_LDADD += $(LTLIBINTL) if BUILD_FREEBSD zpool_LDADD += -lgeom endif zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS) include $(top_srcdir)/config/CppCheck.am zpoolconfdir = $(sysconfdir)/zfs/zpool.d zpoolexecdir = $(zfsexecdir)/zpool.d EXTRA_DIST = zpool.d/README compatibility.d dist_zpoolexec_SCRIPTS = \ zpool.d/dm-deps \ zpool.d/enc \ zpool.d/encdev \ zpool.d/fault_led \ zpool.d/iostat \ zpool.d/iostat-1s \ zpool.d/iostat-10s \ zpool.d/label \ zpool.d/locate_led \ zpool.d/lsblk \ zpool.d/media \ zpool.d/model \ zpool.d/serial \ zpool.d/ses \ zpool.d/size \ zpool.d/slot \ zpool.d/smart \ zpool.d/smartx \ zpool.d/temp \ zpool.d/health \ zpool.d/r_proc \ zpool.d/w_proc \ zpool.d/r_ucor \ zpool.d/w_ucor \ zpool.d/nonmed \ zpool.d/defect \ zpool.d/hours_on \ zpool.d/realloc \ zpool.d/rep_ucor \ zpool.d/cmd_to \ zpool.d/pend_sec \ zpool.d/off_ucor \ zpool.d/ata_err \ zpool.d/nvme_err \ zpool.d/pwr_cyc \ zpool.d/upath \ zpool.d/vendor \ zpool.d/smart_test \ zpool.d/test_type \ zpool.d/test_status \ zpool.d/test_progress \ zpool.d/test_ended zpoolconfdefaults = \ dm-deps \ enc \ encdev \ fault_led \ iostat \ iostat-1s \ iostat-10s \ label \ locate_led \ lsblk \ media \ model \ serial \ ses \ size \ slot \ smart \ smartx \ temp \ health \ r_proc \ w_proc \ r_ucor \ w_ucor \ nonmed \ defect \ hours_on \ realloc \ rep_ucor \ cmd_to \ pend_sec \ off_ucor \ ata_err \ nvme_err \ pwr_cyc \ upath \ vendor \ smart_test \ test_type \ test_status \ test_progress \ test_ended zpoolcompatdir = $(pkgdatadir)/compatibility.d dist_zpoolcompat_DATA = \ compatibility.d/compat-2018 \ compatibility.d/compat-2019 \ compatibility.d/compat-2020 \ compatibility.d/compat-2021 \ compatibility.d/freebsd-11.0 \ compatibility.d/freebsd-11.2 \ compatibility.d/freebsd-11.3 \ compatibility.d/freenas-9.10.2 \ compatibility.d/grub2 \ compatibility.d/openzfsonosx-1.7.0 \ compatibility.d/openzfsonosx-1.8.1 \ compatibility.d/openzfsonosx-1.9.3 \ compatibility.d/openzfs-2.0-freebsd \ compatibility.d/openzfs-2.0-linux \ compatibility.d/openzfs-2.1-freebsd \ compatibility.d/openzfs-2.1-linux \ compatibility.d/zol-0.6.1 \ compatibility.d/zol-0.6.4 \ compatibility.d/zol-0.6.5 \ compatibility.d/zol-0.7 \ compatibility.d/zol-0.8 # canonical <- alias symbolic link pairs # eg: "2018" is a link to "compat-2018" zpoolcompatlinks = \ "compat-2018 2018" \ "compat-2019 2019" \ "compat-2020 2020" \ "compat-2021 2021" \ "freebsd-11.0 freebsd-11.1" \ "freebsd-11.0 freenas-11.0" \ "freebsd-11.2 freenas-11.2" \ "freebsd-11.3 freebsd-11.4" \ "freebsd-11.3 freebsd-12.0" \ "freebsd-11.3 freebsd-12.1" \ "freebsd-11.3 freebsd-12.2" \ "freebsd-11.3 freenas-11.3" \ "freenas-11.0 freenas-11.1" \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ "openzfs-2.0-freebsd truenas-12.0" \ "zol-0.7 ubuntu-18.04" \ "zol-0.8 ubuntu-20.04" install-data-hook: $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" for f in $(zpoolconfdefaults); do \ test -f "$(DESTDIR)$(zpoolconfdir)/$${f}" -o \ -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \ ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \ done for l in $(zpoolcompatlinks); do \ (cd "$(DESTDIR)$(zpoolcompatdir)"; ln -sf $${l} ); \ done diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c index 3d7a0cfc35e6..abfa2b7f6b90 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -1,768 +1,724 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2016 Igor Kozhukhov . */ #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" /* * Private interface for iterating over pools specified on the command line. * Most consumers will call for_each_pool, but in order to support iostat, we * allow fined grained control through the zpool_list_t interface. */ typedef struct zpool_node { zpool_handle_t *zn_handle; uu_avl_node_t zn_avlnode; int zn_mark; } zpool_node_t; struct zpool_list { boolean_t zl_findall; boolean_t zl_literal; uu_avl_t *zl_avl; uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; }; /* ARGSUSED */ static int zpool_compare(const void *larg, const void *rarg, void *unused) { zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle; zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle; const char *lname = zpool_get_name(l); const char *rname = zpool_get_name(r); return (strcmp(lname, rname)); } /* * Callback function for pool_list_get(). Adds the given pool to the AVL tree * of known pools. */ static int add_pool(zpool_handle_t *zhp, void *data) { zpool_list_t *zlp = data; zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); uu_avl_index_t idx; node->zn_handle = zhp; uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { if (zlp->zl_proplist && zpool_expand_proplist(zhp, zlp->zl_proplist, zlp->zl_literal) != 0) { zpool_close(zhp); free(node); return (-1); } uu_avl_insert(zlp->zl_avl, node, idx); } else { zpool_close(zhp); free(node); return (-1); } return (0); } /* * Create a list of pools based on the given arguments. If we're given no * arguments, then iterate over all pools in the system and add them to the AVL * tree. Otherwise, add only those pool explicitly specified on the command * line. */ zpool_list_t * pool_list_get(int argc, char **argv, zprop_list_t **proplist, boolean_t literal, int *err) { zpool_list_t *zlp; zlp = safe_malloc(sizeof (zpool_list_t)); zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t), offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT); if (zlp->zl_pool == NULL) zpool_no_memory(); if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, UU_DEFAULT)) == NULL) zpool_no_memory(); zlp->zl_proplist = proplist; zlp->zl_literal = literal; if (argc == 0) { (void) zpool_iter(g_zfs, add_pool, zlp); zlp->zl_findall = B_TRUE; } else { int i; for (i = 0; i < argc; i++) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != NULL) { if (add_pool(zhp, zlp) != 0) *err = B_TRUE; } else { *err = B_TRUE; } } } return (zlp); } /* * Search for any new pools, adding them to the list. We only add pools when no * options were given on the command line. Otherwise, we keep the list fixed as * those that were explicitly specified. */ void pool_list_update(zpool_list_t *zlp) { if (zlp->zl_findall) (void) zpool_iter(g_zfs, add_pool, zlp); } /* * Iterate over all pools in the list, executing the callback for each */ int pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, void *data) { zpool_node_t *node, *next_node; int ret = 0; for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) { next_node = uu_avl_next(zlp->zl_avl, node); if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL || unavail) ret |= func(node->zn_handle, data); } return (ret); } /* * Remove the given pool from the list. When running iostat, we want to remove * those pools that no longer exist. */ void pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) { zpool_node_t search, *node; search.zn_handle = zhp; if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { uu_avl_remove(zlp->zl_avl, node); zpool_close(node->zn_handle); free(node); } } /* * Free all the handles associated with this list. */ void pool_list_free(zpool_list_t *zlp) { uu_avl_walk_t *walk; zpool_node_t *node; if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) { (void) fprintf(stderr, gettext("internal error: out of memory")); exit(1); } while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(zlp->zl_avl, node); zpool_close(node->zn_handle); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(zlp->zl_avl); uu_avl_pool_destroy(zlp->zl_pool); free(zlp); } /* * Returns the number of elements in the pool list. */ int pool_list_count(zpool_list_t *zlp) { return (uu_avl_numnodes(zlp->zl_avl)); } /* * High level function which iterates over all pools given on the command line, * using the pool_list_* interfaces. */ int for_each_pool(int argc, char **argv, boolean_t unavail, zprop_list_t **proplist, boolean_t literal, zpool_iter_f func, void *data) { zpool_list_t *list; int ret = 0; if ((list = pool_list_get(argc, argv, proplist, literal, &ret)) == NULL) return (1); if (pool_list_iter(list, unavail, func, data) != 0) ret = 1; pool_list_free(list); return (ret); } -static int -for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func, - void *data) -{ - nvlist_t **child; - uint_t c, children; - int ret = 0; - int i; - char *type; - - const char *list[] = { - ZPOOL_CONFIG_SPARES, - ZPOOL_CONFIG_L2CACHE, - ZPOOL_CONFIG_CHILDREN - }; - - for (i = 0; i < ARRAY_SIZE(list); i++) { - if (nvlist_lookup_nvlist_array(nv, list[i], &child, - &children) == 0) { - for (c = 0; c < children; c++) { - uint64_t ishole = 0; - - (void) nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_HOLE, &ishole); - - if (ishole) - continue; - - ret |= for_each_vdev_cb(zhp, child[c], func, - data); - } - } - } - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (ret); - - /* Don't run our function on root vdevs */ - if (strcmp(type, VDEV_TYPE_ROOT) != 0) { - ret |= func(zhp, nv, data); - } - - return (ret); -} - /* * This is the equivalent of for_each_pool() for vdevs. It iterates thorough * all vdevs in the pool, ignoring root vdevs and holes, calling func() on * each one. * * @zhp: Zpool handle * @func: Function to call on each vdev * @data: Custom data to pass to the function */ int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) { nvlist_t *config, *nvroot = NULL; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); } - return (for_each_vdev_cb(zhp, nvroot, func, data)); + return (for_each_vdev_cb((void *) zhp, nvroot, func, data)); } /* * Process the vcdl->vdev_cmd_data[] array to figure out all the unique column * names and their widths. When this function is done, vcdl->uniq_cols, * vcdl->uniq_cols_cnt, and vcdl->uniq_cols_width will be filled in. */ static void process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl) { char **uniq_cols = NULL, **tmp = NULL; int *uniq_cols_width; vdev_cmd_data_t *data; int cnt = 0; int k; /* For each vdev */ for (int i = 0; i < vcdl->count; i++) { data = &vcdl->data[i]; /* For each column the vdev reported */ for (int j = 0; j < data->cols_cnt; j++) { /* Is this column in our list of unique column names? */ for (k = 0; k < cnt; k++) { if (strcmp(data->cols[j], uniq_cols[k]) == 0) break; /* yes it is */ } if (k == cnt) { /* No entry for column, add to list */ tmp = realloc(uniq_cols, sizeof (*uniq_cols) * (cnt + 1)); if (tmp == NULL) break; /* Nothing we can do... */ uniq_cols = tmp; uniq_cols[cnt] = data->cols[j]; cnt++; } } } /* * We now have a list of all the unique column names. Figure out the * max width of each column by looking at the column name and all its * values. */ uniq_cols_width = safe_malloc(sizeof (*uniq_cols_width) * cnt); for (int i = 0; i < cnt; i++) { /* Start off with the column title's width */ uniq_cols_width[i] = strlen(uniq_cols[i]); /* For each vdev */ for (int j = 0; j < vcdl->count; j++) { /* For each of the vdev's values in a column */ data = &vcdl->data[j]; for (k = 0; k < data->cols_cnt; k++) { /* Does this vdev have a value for this col? */ if (strcmp(data->cols[k], uniq_cols[i]) == 0) { /* Is the value width larger? */ uniq_cols_width[i] = MAX(uniq_cols_width[i], strlen(data->lines[k])); } } } } vcdl->uniq_cols = uniq_cols; vcdl->uniq_cols_cnt = cnt; vcdl->uniq_cols_width = uniq_cols_width; } /* * Process a line of command output * * When running 'zpool iostat|status -c' the lines of output can either be * in the form of: * * column_name=value * * Or just: * * value * * Process the column_name (if any) and value. * * Returns 0 if line was processed, and there are more lines can still be * processed. * * Returns 1 if this was the last line to process, or error. */ static int vdev_process_cmd_output(vdev_cmd_data_t *data, char *line) { char *col = NULL; char *val = line; char *equals; char **tmp; if (line == NULL) return (1); equals = strchr(line, '='); if (equals != NULL) { /* * We have a 'column=value' type line. Split it into the * column and value strings by turning the '=' into a '\0'. */ *equals = '\0'; col = line; val = equals + 1; } else { val = line; } /* Do we already have a column by this name? If so, skip it. */ if (col != NULL) { for (int i = 0; i < data->cols_cnt; i++) { if (strcmp(col, data->cols[i]) == 0) return (0); /* Duplicate, skip */ } } if (val != NULL) { tmp = realloc(data->lines, (data->lines_cnt + 1) * sizeof (*data->lines)); if (tmp == NULL) return (1); data->lines = tmp; data->lines[data->lines_cnt] = strdup(val); data->lines_cnt++; } if (col != NULL) { tmp = realloc(data->cols, (data->cols_cnt + 1) * sizeof (*data->cols)); if (tmp == NULL) return (1); data->cols = tmp; data->cols[data->cols_cnt] = strdup(col); data->cols_cnt++; } if (val != NULL && col == NULL) return (1); return (0); } /* * Run the cmd and store results in *data. */ static void vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) { int rc; char *argv[2] = {cmd, 0}; char *env[5] = {"PATH=/bin:/sbin:/usr/bin:/usr/sbin", NULL, NULL, NULL, NULL}; char **lines = NULL; int lines_cnt = 0; int i; /* Setup our custom environment variables */ rc = asprintf(&env[1], "VDEV_PATH=%s", data->path ? data->path : ""); if (rc == -1) { env[1] = NULL; goto out; } rc = asprintf(&env[2], "VDEV_UPATH=%s", data->upath ? data->upath : ""); if (rc == -1) { env[2] = NULL; goto out; } rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", data->vdev_enc_sysfs_path ? data->vdev_enc_sysfs_path : ""); if (rc == -1) { env[3] = NULL; goto out; } /* Run the command */ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, &lines_cnt); if (rc != 0) goto out; /* Process the output we got */ for (i = 0; i < lines_cnt; i++) if (vdev_process_cmd_output(data, lines[i]) != 0) break; out: if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); /* Start with i = 1 since env[0] was statically allocated */ for (i = 1; i < ARRAY_SIZE(env); i++) free(env[i]); } /* * Generate the search path for zpool iostat/status -c scripts. * The string returned must be freed. */ char * zpool_get_cmd_search_path(void) { const char *env; char *sp = NULL; env = getenv("ZPOOL_SCRIPTS_PATH"); if (env != NULL) return (strdup(env)); env = getenv("HOME"); if (env != NULL) { if (asprintf(&sp, "%s/.zpool.d:%s", env, ZPOOL_SCRIPTS_DIR) != -1) { return (sp); } } if (asprintf(&sp, "%s", ZPOOL_SCRIPTS_DIR) != -1) return (sp); return (NULL); } /* Thread function run for each vdev */ static void vdev_run_cmd_thread(void *cb_cmd_data) { vdev_cmd_data_t *data = cb_cmd_data; char *cmd = NULL, *cmddup, *cmdrest; cmddup = strdup(data->cmd); if (cmddup == NULL) return; cmdrest = cmddup; while ((cmd = strtok_r(cmdrest, ",", &cmdrest))) { char *dir = NULL, *sp, *sprest; char fullpath[MAXPATHLEN]; if (strchr(cmd, '/') != NULL) continue; sp = zpool_get_cmd_search_path(); if (sp == NULL) continue; sprest = sp; while ((dir = strtok_r(sprest, ":", &sprest))) { if (snprintf(fullpath, sizeof (fullpath), "%s/%s", dir, cmd) == -1) continue; if (access(fullpath, X_OK) == 0) { vdev_run_cmd(data, fullpath); break; } } free(sp); } free(cmddup); } /* For each vdev in the pool run a command */ static int -for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) +for_each_vdev_run_cb(void *zhp_data, nvlist_t *nv, void *cb_vcdl) { vdev_cmd_data_list_t *vcdl = cb_vcdl; vdev_cmd_data_t *data; char *path = NULL; char *vname = NULL; char *vdev_enc_sysfs_path = NULL; int i, match = 0; + zpool_handle_t *zhp = zhp_data; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vdev_enc_sysfs_path); /* Spares show more than once if they're in use, so skip if exists */ for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) == 0) && (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) { /* vdev already exists, skip it */ return (0); } } /* Check for selected vdevs here, if any */ for (i = 0; i < vcdl->vdev_names_count; i++) { vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags); if (strcmp(vcdl->vdev_names[i], vname) == 0) { free(vname); match = 1; break; /* match */ } free(vname); } /* If we selected vdevs, and this isn't one of them, then bail out */ if (!match && vcdl->vdev_names_count) return (0); /* * Resize our array and add in the new element. */ if (!(vcdl->data = realloc(vcdl->data, sizeof (*vcdl->data) * (vcdl->count + 1)))) return (ENOMEM); /* couldn't realloc */ data = &vcdl->data[vcdl->count]; data->pool = strdup(zpool_get_name(zhp)); data->path = strdup(path); data->upath = zfs_get_underlying_path(path); data->cmd = vcdl->cmd; data->lines = data->cols = NULL; data->lines_cnt = data->cols_cnt = 0; if (vdev_enc_sysfs_path) data->vdev_enc_sysfs_path = strdup(vdev_enc_sysfs_path); else data->vdev_enc_sysfs_path = NULL; vcdl->count++; return (0); } /* Get the names and count of the vdevs */ static int all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl) { return (for_each_vdev(zhp, for_each_vdev_run_cb, cb_vcdl)); } /* * Now that vcdl is populated with our complete list of vdevs, spawn * off the commands. */ static void all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl) { tpool_t *t; t = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); if (t == NULL) return; /* Spawn off the command for each vdev */ for (int i = 0; i < vcdl->count; i++) { (void) tpool_dispatch(t, vdev_run_cmd_thread, (void *) &vcdl->data[i]); } /* Wait for threads to finish */ tpool_wait(t); tpool_destroy(t); } /* * Run command 'cmd' on all vdevs in all pools in argv. Saves the first line of * output from the command in vcdk->data[].line for all vdevs. If you want * to run the command on only certain vdevs, fill in g_zfs, vdev_names, * vdev_names_count, and cb_name_flags. Otherwise leave them as zero. * * Returns a vdev_cmd_data_list_t that must be freed with * free_vdev_cmd_data_list(); */ vdev_cmd_data_list_t * all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, int cb_name_flags) { vdev_cmd_data_list_t *vcdl; vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t)); vcdl->cmd = cmd; vcdl->vdev_names = vdev_names; vcdl->vdev_names_count = vdev_names_count; vcdl->cb_name_flags = cb_name_flags; vcdl->g_zfs = g_zfs; /* Gather our list of all vdevs in all pools */ for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, all_pools_for_each_vdev_gather_cb, vcdl); /* Run command on all vdevs in all pools */ all_pools_for_each_vdev_run_vcdl(vcdl); /* * vcdl->data[] now contains all the column names and values for each * vdev. We need to process that into a master list of unique column * names, and figure out the width of each column. */ process_unique_cmd_columns(vcdl); return (vcdl); } /* * Free the vdev_cmd_data_list_t created by all_pools_for_each_vdev_run() */ void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl) { free(vcdl->uniq_cols); free(vcdl->uniq_cols_width); for (int i = 0; i < vcdl->count; i++) { free(vcdl->data[i].path); free(vcdl->data[i].pool); free(vcdl->data[i].upath); for (int j = 0; j < vcdl->data[i].lines_cnt; j++) free(vcdl->data[i].lines[j]); free(vcdl->data[i].lines); for (int j = 0; j < vcdl->data[i].cols_cnt; j++) free(vcdl->data[i].cols[j]); free(vcdl->data[i].cols); free(vcdl->data[i].vdev_enc_sysfs_path); } free(vcdl->data); free(vcdl); } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index bfef6fc43285..3a2caa9a8101 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -1,10773 +1,10775 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-f] [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); default: __builtin_unreachable(); } } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(7).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole == B_TRUE) { continue; } (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } /* * Print the list of l2cache devices for dry runs. */ static void print_cache_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "cache"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } /* * Print the list of spares for dry runs. */ static void print_spare_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "spares"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); const char *cname = zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } /* * if version is specified, only "legacy" compatibility * may be requested */ if ((prop == ZPOOL_PROP_COMPATIBILITY && strcmp(propval, ZPOOL_COMPAT_LEGACY) != 0 && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && nvlist_exists(proplist, cname) && strcmp(fnvlist_lookup_string(proplist, cname), ZPOOL_COMPAT_LEGACY) != 0)) { (void) fprintf(stderr, gettext("when 'version' is " "specified, the 'compatibility' feature may only " "be set to '" ZPOOL_COMPAT_LEGACY "'\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child, **sparechild; uint_t l2children, sparechildren, c; char *vname; boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } /* And finally the spares */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { hadspare = B_TRUE; (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { if (!hadspare) (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * Return 1 if a vdev is active (being used in a pool) * Return 0 if a vdev is inactive (offlined or faulted, or not in active pool) * * This is useful for checking if a disk in an active pool is offlined or * faulted. */ static int vdev_is_active(char *vdev_path) { int fd; fd = open(vdev_path, O_EXCL); if (fd < 0) { return (1); /* cant open O_EXCL - disk is active */ } close(fd); return (0); /* disk is inactive in the pool */ } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; struct stat st; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(vdev, argv[0], sizeof (vdev)); if (vdev[0] != '/' && stat(vdev, &st) != 0) { int error; error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat(vdev, &st) != 0)) { (void) fprintf(stderr, gettext( "failed to find device %s, try specifying absolute " "path instead\n"), argv[0]); return (1); } } if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("failed to open %s: %s\n"), vdev, strerror(errno)); return (1); } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: /* * We allow the user to call 'zpool offline -f' * on an offlined disk in an active pool. We can check if * the disk is online by calling vdev_is_active(). */ if (force && !vdev_is_active(vdev)) break; (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\""), vdev, zpool_pool_state_to_name(state), name); if (force) { (void) fprintf(stderr, gettext( ". Offline the disk first to clear its label.")); } printf("\n"); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_pool_features = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; if (zpool_name_to_prop(optarg) == ZPOOL_PROP_COMPATIBILITY) compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); print_cache_list(nvroot, 0); print_spare_list(nvroot, 0); ret = 0; } else { /* * Load in feature set. * Note: if compatibility property not given, we'll have * NULL, which means 'all features'. */ boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) goto errout; /* * props contains list of features to enable. * For each feature: * - remove it if feature@name=disabled * - leave it there if feature@name=enabled * - add it if: * - enable_pool_features (ie: no '-d' or '-o version') * - it's supported by the kernel module * - it's in the requested feature set * - warn if it's enabled but not in compat */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); if (strcmp(propval, ZFS_FEATURE_ENABLED) == 0 && !requested_features[i]) (void) fprintf(stderr, gettext( "Warning: feature \"%s\" enabled " "but is not in specified " "'compatibility' feature set.\n"), feat->fi_uname); } else if ( enable_pool_features && feat->fi_zfs_mod_supported && requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_shareall(pool); zfs_commit_all_shares(); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { char *name; nvlist_t **child; uint_t c, children; int ret; name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static int is_blank_str(char *str) { while (str != NULL && *str != '\0') { if (!isblank(*str)) return (0); str++; } return (1); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) { vdev_cmd_data_t *data; int i, j; char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) printf(" "); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) printf(" "); val = data->lines[j]; printf("%s", val ? val : ""); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; char *type; char *path = NULL; char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (cb->cb_literal) { printf(" "); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); printf(" "); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); printf(" "); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); printf(" "); printf_color(rcolor, "%5s", rbuf); printf(" "); printf_color(wcolor, "%5s", wbuf); printf(" "); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); else printf(" %5s", rbuf); } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { (void) printf( gettext(" block size: %dB configured, %dB native"), 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static int show_import(nvlist_t *config, boolean_t report_error) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; uint64_t hostid = 0; char *msgid; char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); /* * If we're importing using a cachefile, then we won't report any * errors unless we are in the scan phase of the import. */ if (reason != ZPOOL_STATUS_OK && !report_error) return (reason); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices contains" " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported " "features are not enabled on the pool.\n\t" "(Note that they may be intentionally disabled " "if the\n\t'compatibility' property is set.)\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" "property.\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n" "requested by the 'compatibility' property.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric\n\tidentifier, " "though the file(s) indicated by its " "'compatibility'\n\tproperty cannot be parsed at " "this time.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted snapshots and bookmarks contain " "an on-disk\n\tincompatibility. This may " "cause on-disk corruption if they are used" "\n\twith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n\t" "No additional action is needed if there " "are no encrypted snapshots or\n\t" "bookmarks. If preserving the encrypted " "snapshots and bookmarks is\n\trequired, " "use a non-raw send to backup and restore " "them. Alternately,\n\tthey may be removed" " to resolve the incompatibility.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%lx)\n\tbefore it " "can be safely imported.\n"), hostname, (unsigned long) hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) { (void) printf(gettext( " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } return (0); } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; zpool_handle_t *zhp; char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%lx)\nExport the pool on the other system, " "then run 'zpool import'.\n"), name, hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { char *hostname = ""; uint64_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%lx) at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, (unsigned long)hostid, ctime((time_t *)×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = (char *)newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. */ if (flags & ZFS_IMPORT_LOAD_KEYS) { ret = zfs_crypto_attempt_load_keys(g_zfs, name); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); return (ret); } static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, char *orig_name, char *new_name, boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ int err = 0; nvpair_t *elem = NULL; boolean_t first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, import->policy) == 0); if (!pool_specified) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { /* * If we're importing from cachefile, then * we don't want to report errors until we * are in the scan phase of the import. If * we get an error, then we return that error * to invoke the scan phase. */ if (import->cachefile && !import->scan) err = show_import(config, B_FALSE); else (void) show_import(config, B_TRUE); } } else if (import->poolname != NULL) { char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, import->poolname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), import->poolname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == import->guid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (pool_specified && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), orig_name); err = B_TRUE; } else { err |= do_import(found_config, new_name, mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. */ if (!pool_specified && first) (void) fprintf(stderr, gettext("no pools available to import\n")); return (err); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] [-n] [-F] * [newpool] * * -c Read pool information from a cachefile instead of searching * devices. If importing from a cachefile config fails, then * fallback to searching for devices only in the directories that * exist in the cachefile. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *policy = NULL; nvlist_t *props = NULL; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; boolean_t pool_specified = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if (cachefile && do_scan) { (void) fprintf(stderr, gettext("-c is incompatible with -s\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); if (searchdirs != NULL) free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir, *tmp = NULL; envdup = strdup(env); for (dir = strtok_r(envdup, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) { searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = dir; } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { free(searchdirs); free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, do_destroyed, pool_specified, do_all, &idata); /* * If we're using the cachefile and we failed to import, then * fallback to scanning the directory for pools that match * those in the cachefile. */ if (err != 0 && cachefile != NULL) { (void) printf(gettext("cachefile import failed, retrying\n")); /* * We use the scan flag to gather the directories that exist * in the cachefile. If we need to fallback to searching for * the pool config, we will only search devices in these * directories. */ idata.scan = B_TRUE; nvlist_free(pools); pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, do_destroyed, pool_specified, do_all, &idata); } error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); free(searchdirs); free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; char **cb_vdev_names; /* Only show these vdevs */ unsigned int cb_vdev_names_count; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 15 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {"rebuild", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {"rebuildq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {"rebuild", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {"rebuild"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them. */ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else printf(" %*s", column_size, buf); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. */ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (i = 0; i < ARRAY_SIZE(names); i++) { val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdev_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdev_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. */ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, oldnv, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. */ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int -is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) +is_vdev_cb(void *zhp_data, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; + zpool_handle_t *zhp = zhp_data; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); if (strcmp(name, cb->cb_vdev_names[0]) == 0) ret = 1; /* match */ free(name); return (ret); } /* * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_vdev_names for a second... */ tmp_name = cb->cb_vdev_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_vdev_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_vdev_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, B_FALSE, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. */ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void fsleep(float sec) { struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, "-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { sprintf(fullpath, "%s/%s", dirpath, ent->d_name); /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. */ static void print_zpool_script_list(char *subcommand) { char *dir, *sp, *tmp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; for (dir = strtok_r(sp, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) print_zpool_dir_scripts(dir); free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { /* All the args are vdevs */ cb.cb_vdev_names = argv; cb.cb_vdev_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { /* ...and the rest are vdev names */ cb.cb_vdev_names = argv + 1; cb.cb_vdev_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdev_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdev_names, cb.cb_vdev_names_count, cb.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdev_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; (void) strlcpy(propval, str, sizeof (propval)); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { char *bias = NULL; char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE); free(vname); } } } /* * Generic callback function to list a pool. */ static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; print_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) ret = zpool_wait(zhp, replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); print_vdev_tree(NULL, "dedup", config, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", config, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online ... */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] ... * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. */ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (fault) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "spw")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (cb.cb_type == POOL_SCAN_NONE && cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { (void) fprintf(stderr, gettext("invalid option combination: " "-s and -p are mutually exclusive\n")); usage(B_FALSE); } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. */ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } - if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { - (void) fprintf(stderr, - gettext("invalid value for rate\n")); + if (zfs_nicestrtonum(g_zfs, optarg, &rate) == -1) { + (void) fprintf(stderr, "%s: %s\n", + gettext("invalid value for rate"), + libzfs_error_description(g_zfs)); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed scrub status. */ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; char srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? ((total - issued) / issue_rate) : UINT64_MAX; secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total, total_buf, sizeof (total_buf)); zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); /* do not print estimated time if we have a paused scrub */ if (pause == 0) { (void) printf(gettext("\t%s scanned at %s/s, " "%s issued at %s/s, %s total\n"), scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est = vrs->vrs_bytes_est; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. */ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / MAX(scan_rate, 1), time_buf); (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " "%s total\n"), bytes_scanned_buf, scan_rate_buf, bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (scan_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. */ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. */ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } /* * Print the scan status. */ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub) print_scan_scrub_resilver_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext( "\t%s copied out of %s at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext( "\t%s memory used for removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR || reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); printf(" "); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " "'zpool upgrade'. Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n\t" "The pool can still be used, but some features are " "unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(7) for details.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("This pool has a " "compatibility list specified, but it could not be\n\t" "read/parsed at this time. The pool can still be used, " "but this\n\tshould be investigated.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Check the value of the " "'compatibility' property against the\n\t" "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n\t" "requested by the 'compatibility' property.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Consider setting " "'compatibility' to an appropriate value, or\n\t" "adding needed features to the relevant file in\n\t" ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " "pool from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: (void) printf(gettext("status: One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); (void) printf(gettext("action: Replace affected devices with " "devices that support the\n\tconfigured block size, or " "migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("\tExisting encrypted snapshots " "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the" "issue, enable the bookmark_v2 feature. No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); printf(gettext( " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... * [interval [count]] * * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { (void) fprintf(stderr, gettext("Upgrade not performed because " "'compatibility' property set to '" ZPOOL_COMPAT_LEGACY "'.\n")); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) return (-1); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t modified_pool = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); modified_pool = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; modified_pool = B_TRUE; } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n" "Note that setting a pool's 'compatibility' " "feature to '" ZPOOL_COMPAT_LEGACY "' will\n" "inhibit upgrades.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(7) for " "details.\n\n" "Note that the pool " "'compatibility' feature can be " "used to inhibit\nfeature " "upgrades.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t modified_pool = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { modified_pool = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { modified_pool = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; if (!fi->fi_zfs_mod_supported) continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? " (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported and " "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { nvlist_t **records; uint_t numrecords; int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], ZPOOL_HIST_ELAPSED_NS); (void) snprintf(tbuf + strlen(tbuf), sizeof (tbuf) - strlen(tbuf), " (%lldms)", (long long)elapsed_ns / 1000 / 1000); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { (void) printf(" output nvlist omitted; " "original size: %lldKB\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_OUTPUT_SIZE) / 1024); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32], *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(ev_opts_t *opts) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(&opts); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; /* Check if we have out-of-bounds features */ if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) { boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(cb->cb_value, features) != ZPOOL_COMPATIBILITY_OK) return (-1); nvlist_t *enabled = zpool_get_features(zhp); spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; if (nvlist_exists(enabled, fguid) && !features[i]) break; } if (i < SPA_FEATURES) (void) fprintf(stderr, gettext("Warning: one or " "more features already enabled on pool '%s'\n" "are not present in this compatibility set.\n"), zpool_get_name(zhp)); } /* if we're setting a feature, check it's in compatibility set */ if (zpool_prop_feature(cb->cb_propname) && strcmp(cb->cb_value, ZFS_FEATURE_ENABLED) == 0) { char *fname = strchr(cb->cb_propname, '@') + 1; spa_feature_t f; if (zfeature_lookup_name(fname, &f) == 0) { char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(compat, features) != ZPOOL_COMPATIBILITY_OK) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "because the pool's 'compatibility' " "property cannot be parsed.\n"), fname, zpool_get_name(zhp)); return (-1); } if (!features[f]) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "as it is not specified in this pool's " "current compatibility set.\n" "Consider setting 'compatibility' to a " "less restrictive set, or to 'off'.\n"), fname, zpool_get_name(zhp)); return (-1); } } } error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, B_FALSE, set_callback, &cb); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) { if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. */ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) printf("\n"); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. * * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); else zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; int c; char *value; int i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': { static char *col_subopts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", NULL }; /* Reset activities array */ bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); while (*optarg != '\0') { int activity = getsubopt(&optarg, col_subopts, &value); if (activity < 0) { (void) fprintf(stderr, gettext("invalid activity '%s'\n"), value); usage(B_FALSE); } wd.wd_enabled[activity] = B_TRUE; } break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. */ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { if (zfs_version_print() == -1) return (1); return (0); } /* * Do zpool_load_compat() and print error message on failure */ static zpool_compat_status_t zpool_do_load_compat(const char *compat, boolean_t *list) { char report[1024]; zpool_compat_status_t ret; ret = zpool_load_compat(compat, list, report, 1024); switch (ret) { case ZPOOL_COMPATIBILITY_OK: break; case ZPOOL_COMPATIBILITY_NOFILES: case ZPOOL_COMPATIBILITY_BADFILE: case ZPOOL_COMPATIBILITY_BADTOKEN: (void) fprintf(stderr, "Error: %s\n", report); break; case ZPOOL_COMPATIBILITY_WARNTOKEN: (void) fprintf(stderr, "Warning: %s\n", report); ret = ZPOOL_COMPATIBILITY_OK; break; } return (ret); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h index 4002e5794021..6665eaf0d44e 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h @@ -1,140 +1,140 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef ZPOOL_UTIL_H #define ZPOOL_UTIL_H #include #include +#include #ifdef __cplusplus extern "C" { #endif /* Path to scripts you can run with "zpool status/iostat -c" */ #define ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d" /* * Basic utility functions */ void *safe_malloc(size_t); void *safe_realloc(void *, size_t); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); uint64_t array64_max(uint64_t array[], unsigned int len); int highbit64(uint64_t i); int lowbit64(uint64_t i); /* * Misc utility functions */ char *zpool_get_cmd_search_path(void); /* * Virtual device functions */ nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv); nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv); /* * Pool list functions */ int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, boolean_t, zpool_iter_f, void *); /* Vdev list functions */ -typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); typedef struct zpool_list zpool_list_t; zpool_list_t *pool_list_get(int, char **, zprop_list_t **, boolean_t, int *); void pool_list_update(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); int pool_list_count(zpool_list_t *); void pool_list_remove(zpool_list_t *, zpool_handle_t *); extern libzfs_handle_t *g_zfs; typedef struct vdev_cmd_data { char **lines; /* Array of lines of output, minus the column name */ int lines_cnt; /* Number of lines in the array */ char **cols; /* Array of column names */ int cols_cnt; /* Number of column names */ char *path; /* vdev path */ char *upath; /* vdev underlying path */ char *pool; /* Pool name */ char *cmd; /* backpointer to cmd */ char *vdev_enc_sysfs_path; /* enclosure sysfs path (if any) */ } vdev_cmd_data_t; typedef struct vdev_cmd_data_list { char *cmd; /* Command to run */ unsigned int count; /* Number of vdev_cmd_data items (vdevs) */ /* fields used to select only certain vdevs, if requested */ libzfs_handle_t *g_zfs; char **vdev_names; int vdev_names_count; int cb_name_flags; vdev_cmd_data_t *data; /* Array of vdevs */ /* List of unique column names and widths */ char **uniq_cols; int uniq_cols_cnt; int *uniq_cols_width; } vdev_cmd_data_list_t; vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, int cb_name_flags); void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); int check_device(const char *path, boolean_t force, boolean_t isspare, boolean_t iswholedisk); boolean_t check_sector_size_database(char *path, int *sector_size); void vdev_error(const char *fmt, ...) __attribute__((format(printf, 1, 2))); int check_file(const char *file, boolean_t force, boolean_t isspare); void after_zpool_upgrade(zpool_handle_t *zhp); int check_file_generic(const char *file, boolean_t force, boolean_t isspare); #ifdef __cplusplus } #endif #endif /* ZPOOL_UTIL_H */ diff --git a/sys/contrib/openzfs/contrib/initramfs/hooks/zfs.in b/sys/contrib/openzfs/contrib/initramfs/hooks/zfs.in index bdf169fd96c2..9d5c397cf224 100755 --- a/sys/contrib/openzfs/contrib/initramfs/hooks/zfs.in +++ b/sys/contrib/openzfs/contrib/initramfs/hooks/zfs.in @@ -1,57 +1,56 @@ #!/bin/sh # # Add OpenZFS filesystem capabilities to an initrd, usually for a native ZFS root. # if [ "$1" = "prereqs" ]; then echo "udev" exit fi . /usr/share/initramfs-tools/hook-functions for req in "@sbindir@/zpool" "@sbindir@/zfs" "@mounthelperdir@/mount.zfs"; do copy_exec "$req" || { echo "$req not available!" >&2 exit 2 } done -copy_exec "@sbindir@/zdb" copy_exec "@udevdir@/vdev_id" copy_exec "@udevdir@/zvol_id" if command -v systemd-ask-password > /dev/null; then copy_exec "$(command -v systemd-ask-password)" fi # We use pthreads, but i-t from buster doesn't automatically # copy this indirect dependency: this can be removed when buster finally dies. find /lib/ -type f -name "libgcc_s.so.[1-9]" | while read -r libgcc; do copy_exec "$libgcc" done # shellcheck disable=SC2050 if [ @LIBFETCH_DYNAMIC@ != 0 ]; then find /lib/ -name @LIBFETCH_SONAME@ | while read -r libfetch; do copy_exec "$libfetch" done fi copy_file config "/etc/hostid" copy_file cache "@sysconfdir@/zfs/zpool.cache" copy_file config "@initconfdir@/zfs" copy_file config "@sysconfdir@/zfs/zfs-functions" copy_file config "@sysconfdir@/zfs/vdev_id.conf" copy_file rule "@udevruledir@/60-zvol.rules" copy_file rule "@udevruledir@/69-vdev.rules" manual_add_modules zfs if [ -f "/etc/hostname" ]; then copy_file config "/etc/hostname" else hostname="$(mktemp -t hostname.XXXXXXXXXX)" hostname > "$hostname" copy_file config "$hostname" "/etc/hostname" rm -f "$hostname" fi diff --git a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs index 306e6e157e62..35502291e6f2 100644 --- a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs +++ b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs @@ -1,994 +1,995 @@ # ZFS boot stub for initramfs-tools. # # In the initramfs environment, the /init script sources this stub to # override the default functions in the /scripts/local script. # # Enable this by passing boot=zfs on the kernel command line. # # $quiet, $root, $rpool, $bootfs come from the cmdline: # shellcheck disable=SC2154 # Source the common functions . /etc/zfs/zfs-functions # Start interactive shell. # Use debian's panic() if defined, because it allows to prevent shell access # by setting panic in cmdline (e.g. panic=0 or panic=15). # See "4.5 Disable root prompt on the initramfs" of Securing Debian Manual: # https://www.debian.org/doc/manuals/securing-debian-howto/ch4.en.html shell() { if command -v panic > /dev/null 2>&1; then panic else /bin/sh fi } # This runs any scripts that should run before we start importing # pools and mounting any filesystems. pre_mountroot() { if command -v run_scripts > /dev/null 2>&1 then if [ -f "/scripts/local-top" ] || [ -d "/scripts/local-top" ] then [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Running /scripts/local-top" run_scripts /scripts/local-top [ "$quiet" != "y" ] && zfs_log_end_msg fi if [ -f "/scripts/local-premount" ] || [ -d "/scripts/local-premount" ] then [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Running /scripts/local-premount" run_scripts /scripts/local-premount [ "$quiet" != "y" ] && zfs_log_end_msg fi fi } # If plymouth is available, hide the splash image. disable_plymouth() { if [ -x /bin/plymouth ] && /bin/plymouth --ping then /bin/plymouth hide-splash >/dev/null 2>&1 fi } # Get a ZFS filesystem property value. get_fs_value() { fs="$1" value=$2 "${ZFS}" get -H -ovalue "$value" "$fs" 2> /dev/null } # Find the 'bootfs' property on pool $1. # If the property does not contain '/', then ignore this # pool by exporting it again. find_rootfs() { pool="$1" # If 'POOL_IMPORTED' isn't set, no pool imported and therefore # we won't be able to find a root fs. [ -z "${POOL_IMPORTED}" ] && return 1 # If it's already specified, just keep it mounted and exit # User (kernel command line) must be correct. [ -n "${ZFS_BOOTFS}" ] && return 0 # Not set, try to find it in the 'bootfs' property of the pool. # NOTE: zpool does not support 'get -H -ovalue bootfs'... ZFS_BOOTFS=$("${ZPOOL}" list -H -obootfs "$pool") # Make sure it's not '-' and that it starts with /. if [ "${ZFS_BOOTFS}" != "-" ] && \ get_fs_value "${ZFS_BOOTFS}" mountpoint | grep -q '^/$' then # Keep it mounted POOL_IMPORTED=1 return 0 fi # Not boot fs here, export it and later try again.. "${ZPOOL}" export "$pool" POOL_IMPORTED= ZFS_BOOTFS= return 1 } # Support function to get a list of all pools, separated with ';' find_pools() { pools=$("$@" 2> /dev/null | \ grep -E "pool:|^[a-zA-Z0-9]" | \ sed 's@.*: @@' | \ tr '\n' ';') echo "${pools%%;}" # Return without the last ';'. } # Get a list of all available pools get_pools() { if [ -n "${ZFS_POOL_IMPORT}" ]; then echo "$ZFS_POOL_IMPORT" return 0 fi # Get the base list of available pools. available_pools=$(find_pools "$ZPOOL" import) # Just in case - seen it happen (that a pool isn't visible/found # with a simple "zpool import" but only when using the "-d" # option or setting ZPOOL_IMPORT_PATH). if [ -d "/dev/disk/by-id" ] then npools=$(find_pools "$ZPOOL" import -d /dev/disk/by-id) if [ -n "$npools" ] then # Because we have found extra pool(s) here, which wasn't # found 'normally', we need to force USE_DISK_BY_ID to # make sure we're able to actually import it/them later. USE_DISK_BY_ID='yes' if [ -n "$available_pools" ] then # Filter out duplicates (pools found with the simple # "zpool import" but which is also found with the # "zpool import -d ..."). npools=$(echo "$npools" | sed "s,$available_pools,,") # Add the list to the existing list of # available pools available_pools="$available_pools;$npools" else available_pools="$npools" fi fi fi # Filter out any exceptions... if [ -n "$ZFS_POOL_EXCEPTIONS" ] then found="" apools="" OLD_IFS="$IFS" ; IFS=";" for pool in $available_pools do for exception in $ZFS_POOL_EXCEPTIONS do [ "$pool" = "$exception" ] && continue 2 found="$pool" done if [ -n "$found" ] then if [ -n "$apools" ] then apools="$apools;$pool" else apools="$pool" fi fi done IFS="$OLD_IFS" available_pools="$apools" fi # Return list of available pools. echo "$available_pools" } # Import given pool $1 import_pool() { pool="$1" # Verify that the pool isn't already imported # Make as sure as we can to not require '-f' to import. "${ZPOOL}" get name,guid -o value -H 2>/dev/null | grep -Fxq "$pool" && return 0 # For backwards compatibility, make sure that ZPOOL_IMPORT_PATH is set # to something we can use later with the real import(s). We want to # make sure we find all by* dirs, BUT by-vdev should be first (if it # exists). if [ -n "$USE_DISK_BY_ID" ] && [ -z "$ZPOOL_IMPORT_PATH" ] then dirs="$(for dir in /dev/disk/by-* do # Ignore by-vdev here - we want it first! echo "$dir" | grep -q /by-vdev && continue [ ! -d "$dir" ] && continue printf "%s" "$dir:" done | sed 's,:$,,g')" if [ -d "/dev/disk/by-vdev" ] then # Add by-vdev at the beginning. ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:" fi # ... and /dev at the very end, just for good measure. ZPOOL_IMPORT_PATH="$ZPOOL_IMPORT_PATH$dirs:/dev" fi # Needs to be exported for "zpool" to catch it. [ -n "$ZPOOL_IMPORT_PATH" ] && export ZPOOL_IMPORT_PATH [ "$quiet" != "y" ] && zfs_log_begin_msg \ "Importing pool '${pool}' using defaults" ZFS_CMD="${ZPOOL} import -N ${ZPOOL_FORCE} ${ZPOOL_IMPORT_OPTS}" ZFS_STDERR="$($ZFS_CMD "$pool" 2>&1)" ZFS_ERROR="$?" if [ "${ZFS_ERROR}" != 0 ] then [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" if [ -f "${ZPOOL_CACHE}" ] then [ "$quiet" != "y" ] && zfs_log_begin_msg \ "Importing pool '${pool}' using cachefile." ZFS_CMD="${ZPOOL} import -c ${ZPOOL_CACHE} -N ${ZPOOL_FORCE} ${ZPOOL_IMPORT_OPTS}" ZFS_STDERR="$($ZFS_CMD "$pool" 2>&1)" ZFS_ERROR="$?" fi if [ "${ZFS_ERROR}" != 0 ] then [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" disable_plymouth echo "" echo "Command: ${ZFS_CMD} '$pool'" echo "Message: $ZFS_STDERR" echo "Error: $ZFS_ERROR" echo "" echo "Failed to import pool '$pool'." echo "Manually import the pool and exit." shell fi fi [ "$quiet" != "y" ] && zfs_log_end_msg POOL_IMPORTED=1 return 0 } # Load ZFS modules # Loading a module in a initrd require a slightly different approach, # with more logging etc. load_module_initrd() { [ -n "$ROOTDELAY" ] && ZFS_INITRD_PRE_MOUNTROOT_SLEEP="$ROOTDELAY" if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ] 2>/dev/null then if [ "$quiet" != "y" ]; then zfs_log_begin_msg "Sleeping for" \ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP seconds..." fi sleep "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" [ "$quiet" != "y" ] && zfs_log_end_msg fi # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. if command -v wait_for_udev > /dev/null 2>&1 ; then wait_for_udev 10 elif command -v wait_for_dev > /dev/null 2>&1 ; then wait_for_dev fi # zpool import refuse to import without a valid /proc/self/mounts [ ! -f /proc/self/mounts ] && mount proc /proc # Load the module load_module "zfs" || return 1 if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" -gt 0 ] 2>/dev/null then if [ "$quiet" != "y" ]; then zfs_log_begin_msg "Sleeping for" \ "$ZFS_INITRD_POST_MODPROBE_SLEEP seconds..." fi sleep "$ZFS_INITRD_POST_MODPROBE_SLEEP" [ "$quiet" != "y" ] && zfs_log_end_msg fi return 0 } # Mount a given filesystem mount_fs() { fs="$1" # Check that the filesystem exists "${ZFS}" list -oname -tfilesystem -H "${fs}" > /dev/null 2>&1 || return 1 # Skip filesystems with canmount=off. The root fs should not have # canmount=off, but ignore it for backwards compatibility just in case. if [ "$fs" != "${ZFS_BOOTFS}" ] then canmount=$(get_fs_value "$fs" canmount) [ "$canmount" = "off" ] && return 0 fi # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) ZFS_CMD="mount -o zfsutil -t zfs" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in # clone_snap() if that's usable. mountpoint=$(get_fs_value "$fs" org.zol:mountpoint) if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ] || [ "$mountpoint" = "-" ] then if [ "$fs" != "${ZFS_BOOTFS}" ]; then # We don't have a proper mountpoint and this # isn't the root fs. return 0 else # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" fi fi # If it's not a legacy filesystem, it can only be a # native one... if [ "$mountpoint" = "legacy" ]; then ZFS_CMD="mount -t zfs" fi fi # Possibly decrypt a filesystem using native encryption. decrypt_fs "$fs" [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Mounting '${fs}' on '${rootmnt}/${mountpoint}'" [ -n "${ZFS_DEBUG}" ] && \ zfs_log_begin_msg "CMD: '$ZFS_CMD ${fs} ${rootmnt}/${mountpoint}'" ZFS_STDERR=$(${ZFS_CMD} "${fs}" "${rootmnt}/${mountpoint}" 2>&1) ZFS_ERROR=$? if [ "${ZFS_ERROR}" != 0 ] then [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" disable_plymouth echo "" echo "Command: ${ZFS_CMD} ${fs} ${rootmnt}/${mountpoint}" echo "Message: $ZFS_STDERR" echo "Error: $ZFS_ERROR" echo "" echo "Failed to mount ${fs} on ${rootmnt}/${mountpoint}." echo "Manually mount the filesystem and exit." shell else [ "$quiet" != "y" ] && zfs_log_end_msg fi return 0 } # Unlock a ZFS native encrypted filesystem. decrypt_fs() { fs="$1" # If pool encryption is active and the zfs command understands '-o encryption' if [ "$(zpool list -H -o feature@encryption "${fs%%/*}")" = 'active' ]; then # Determine dataset that holds key for root dataset ENCRYPTIONROOT="$(get_fs_value "${fs}" encryptionroot)" KEYLOCATION="$(get_fs_value "${ENCRYPTIONROOT}" keylocation)" echo "${ENCRYPTIONROOT}" > /run/zfs_fs_name # If root dataset is encrypted... if ! [ "${ENCRYPTIONROOT}" = "-" ]; then KEYSTATUS="$(get_fs_value "${ENCRYPTIONROOT}" keystatus)" # Continue only if the key needs to be loaded [ "$KEYSTATUS" = "unavailable" ] || return 0 # Do not prompt if key is stored noninteractively, if ! [ "${KEYLOCATION}" = "prompt" ]; then $ZFS load-key "${ENCRYPTIONROOT}" # Prompt with plymouth, if active elif /bin/plymouth --ping 2>/dev/null; then echo "plymouth" > /run/zfs_console_askpwd_cmd for _ in 1 2 3; do plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \ $ZFS load-key "${ENCRYPTIONROOT}" && break done # Prompt with systemd, if active elif [ -e /run/systemd/system ]; then echo "systemd-ask-password" > /run/zfs_console_askpwd_cmd for _ in 1 2 3; do systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \ $ZFS load-key "${ENCRYPTIONROOT}" && break done # Prompt with ZFS tty, otherwise else # Temporarily setting "printk" to "7" allows the prompt to appear even when the "quiet" kernel option has been used echo "load-key" > /run/zfs_console_askpwd_cmd storeprintk="$(awk '{print $1}' /proc/sys/kernel/printk)" echo 7 > /proc/sys/kernel/printk $ZFS load-key "${ENCRYPTIONROOT}" echo "$storeprintk" > /proc/sys/kernel/printk fi fi fi return 0 } # Destroy a given filesystem. destroy_fs() { fs="$1" [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Destroying '$fs'" ZFS_CMD="${ZFS} destroy $fs" ZFS_STDERR="$(${ZFS_CMD} 2>&1)" ZFS_ERROR="$?" if [ "${ZFS_ERROR}" != 0 ] then [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" disable_plymouth echo "" echo "Command: $ZFS_CMD" echo "Message: $ZFS_STDERR" echo "Error: $ZFS_ERROR" echo "" echo "Failed to destroy '$fs'. Please make sure that '$fs' is not available." echo "Hint: Try: zfs destroy -Rfn $fs" echo "If this dryrun looks good, then remove the 'n' from '-Rfn' and try again." shell else [ "$quiet" != "y" ] && zfs_log_end_msg fi return 0 } # Clone snapshot $1 to destination filesystem $2 # Set 'canmount=noauto' and 'mountpoint=none' so that we get to keep # manual control over it's mounting (i.e., make sure it's not automatically # mounted with a 'zfs mount -a' in the init/systemd scripts). clone_snap() { snap="$1" destfs="$2" mountpoint="$3" [ "$quiet" != "y" ] && zfs_log_begin_msg "Cloning '$snap' to '$destfs'" # Clone the snapshot into a dataset we can boot from # + We don't want this filesystem to be automatically mounted, we # want control over this here and nowhere else. # + We don't need any mountpoint set for the same reason. # We use the 'org.zol:mountpoint' property to remember the mountpoint. ZFS_CMD="${ZFS} clone -o canmount=noauto -o mountpoint=none" ZFS_CMD="${ZFS_CMD} -o org.zol:mountpoint=${mountpoint}" ZFS_CMD="${ZFS_CMD} $snap $destfs" ZFS_STDERR="$(${ZFS_CMD} 2>&1)" ZFS_ERROR="$?" if [ "${ZFS_ERROR}" != 0 ] then [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" disable_plymouth echo "" echo "Command: $ZFS_CMD" echo "Message: $ZFS_STDERR" echo "Error: $ZFS_ERROR" echo "" echo "Failed to clone snapshot." echo "Make sure that the any problems are corrected and then make sure" echo "that the dataset '$destfs' exists and is bootable." shell else [ "$quiet" != "y" ] && zfs_log_end_msg fi return 0 } # Rollback a given snapshot. rollback_snap() { snap="$1" [ "$quiet" != "y" ] && zfs_log_begin_msg "Rollback $snap" ZFS_CMD="${ZFS} rollback -Rf $snap" ZFS_STDERR="$(${ZFS_CMD} 2>&1)" ZFS_ERROR="$?" if [ "${ZFS_ERROR}" != 0 ] then [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" disable_plymouth echo "" echo "Command: $ZFS_CMD" echo "Message: $ZFS_STDERR" echo "Error: $ZFS_ERROR" echo "" echo "Failed to rollback snapshot." shell else [ "$quiet" != "y" ] && zfs_log_end_msg fi return 0 } # Get a list of snapshots, give them as a numbered list # to the user to choose from. ask_user_snap() { fs="$1" # We need to temporarily disable debugging. Set 'debug' so we # remember to enabled it again. if [ -n "${ZFS_DEBUG}" ]; then unset ZFS_DEBUG set +x debug=1 fi # Because we need the resulting snapshot, which is sent on # stdout to the caller, we use stderr for our questions. echo "What snapshot do you want to boot from?" > /dev/stderr # shellcheck disable=SC2046 IFS=" " set -- $("${ZFS}" list -H -oname -tsnapshot -r "${fs}") i=1 for snap in "$@"; do echo " $i: $snap" i=$((i + 1)) done > /dev/stderr # expr instead of test here because [ a -lt 0 ] errors out, # but expr falls back to lexicographical, which works out right snapnr=0 while expr "$snapnr" "<" 1 > /dev/null || expr "$snapnr" ">" "$#" > /dev/null do printf "%s" "Snap nr [1-$#]? " > /dev/stderr read -r snapnr done # Re-enable debugging. if [ -n "${debug}" ]; then ZFS_DEBUG=1 set -x fi eval echo '$'"$snapnr" } setup_snapshot_booting() { snap="$1" retval=0 # Make sure that the snapshot specified actually exists. if [ ! "$(get_fs_value "${snap}" type)" ] then # Snapshot does not exist (...@ ?) # ask the user for a snapshot to use. snap="$(ask_user_snap "${snap%%@*}")" fi # Separate the full snapshot ('$snap') into it's filesystem and # snapshot names. Would have been nice with a split() function.. rootfs="${snap%%@*}" snapname="${snap##*@}" ZFS_BOOTFS="${rootfs}_${snapname}" if ! grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline then # If the destination dataset for the clone # already exists, destroy it. Recursively if [ "$(get_fs_value "${rootfs}_${snapname}" type)" ]; then filesystems=$("${ZFS}" list -oname -tfilesystem -H \ -r -Sname "${ZFS_BOOTFS}") for fs in $filesystems; do destroy_fs "${fs}" done fi fi # Get all snapshots, recursively (might need to clone /usr, /var etc # as well). for s in $("${ZFS}" list -H -oname -tsnapshot -r "${rootfs}" | \ grep "${snapname}") do if grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline then # Rollback snapshot rollback_snap "$s" || retval=$((retval + 1)) + ZFS_BOOTFS="${rootfs}" else # Setup a destination filesystem name. # Ex: Called with 'rpool/ROOT/debian@snap2' # rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2 # rpool/ROOT/debian/boot@snap2 => rpool/ROOT/debian_snap2/boot # rpool/ROOT/debian/usr@snap2 => rpool/ROOT/debian_snap2/usr # rpool/ROOT/debian/var@snap2 => rpool/ROOT/debian_snap2/var subfs="${s##$rootfs}" subfs="${subfs%%@$snapname}" destfs="${rootfs}_${snapname}" # base fs. [ -n "$subfs" ] && destfs="${destfs}$subfs" # + sub fs. # Get the mountpoint of the filesystem, to be used # with clone_snap(). If legacy or none, then use # the sub fs value. mountpoint=$(get_fs_value "${s%%@*}" mountpoint) if [ "$mountpoint" = "legacy" ] || \ [ "$mountpoint" = "none" ] then if [ -n "${subfs}" ]; then mountpoint="${subfs}" else mountpoint="/" fi fi # Clone the snapshot into its own # filesystem clone_snap "$s" "${destfs}" "${mountpoint}" || \ retval=$((retval + 1)) fi done # If we haven't return yet, we have a problem... return "${retval}" } # ================================================================ # This is the main function. mountroot() { # ---------------------------------------------------------------- # I N I T I A L S E T U P # ------------ # Run the pre-mount scripts from /scripts/local-top. pre_mountroot # ------------ # Source the default setup variables. [ -r '/etc/default/zfs' ] && . /etc/default/zfs # ------------ # Support debug option if grep -qiE '(^|[^\\](\\\\)* )(zfs_debug|zfs\.debug|zfsdebug)=(on|yes|1)( |$)' /proc/cmdline then ZFS_DEBUG=1 mkdir /var/log #exec 2> /var/log/boot.debug set -x fi # ------------ # Load ZFS module etc. if ! load_module_initrd; then disable_plymouth echo "" echo "Failed to load ZFS modules." echo "Manually load the modules and exit." shell fi # ------------ # Look for the cache file (if any). [ -f "${ZPOOL_CACHE}" ] || unset ZPOOL_CACHE [ -s "${ZPOOL_CACHE}" ] || unset ZPOOL_CACHE # ------------ # Compatibility: 'ROOT' is for Debian GNU/Linux (etc), # 'root' is for Redhat/Fedora (etc), # 'REAL_ROOT' is for Gentoo if [ -z "$ROOT" ] then [ -n "$root" ] && ROOT=${root} [ -n "$REAL_ROOT" ] && ROOT=${REAL_ROOT} fi # ------------ # Where to mount the root fs in the initrd - set outside this script # Compatibility: 'rootmnt' is for Debian GNU/Linux (etc), # 'NEWROOT' is for RedHat/Fedora (etc), # 'NEW_ROOT' is for Gentoo if [ -z "$rootmnt" ] then [ -n "$NEWROOT" ] && rootmnt=${NEWROOT} [ -n "$NEW_ROOT" ] && rootmnt=${NEW_ROOT} fi # ------------ # No longer set in the defaults file, but it could have been set in # get_pools() in some circumstances. If it's something, but not 'yes', # it's no good to us. [ -n "$USE_DISK_BY_ID" ] && [ "$USE_DISK_BY_ID" != 'yes' ] && \ unset USE_DISK_BY_ID # ---------------------------------------------------------------- # P A R S E C O M M A N D L I N E O P T I O N S # This part is the really ugly part - there's so many options and permutations # 'out there', and if we should make this the 'primary' source for ZFS initrd # scripting, we need/should support them all. # # Supports the following kernel command line argument combinations # (in this order - first match win): # # rpool= (tries to finds bootfs automatically) # bootfs=/ (uses this for rpool - first part) # rpool= bootfs=/ # -B zfs-bootfs=/ (uses this for rpool - first part) # rpool=rpool (default if none of the above is used) # root=/ (uses this for rpool - first part) # root=ZFS=/ (uses this for rpool - first part, without 'ZFS=') # root=zfs:AUTO (tries to detect both pool and rootfs # root=zfs:/ (uses this for rpool - first part, without 'zfs:') # # Option could also be # Option could also be # ------------ # Support force option # In addition, setting one of zfs_force, zfs.force or zfsforce to # 'yes', 'on' or '1' will make sure we force import the pool. # This should (almost) never be needed, but it's here for # completeness. ZPOOL_FORCE="" if grep -qiE '(^|[^\\](\\\\)* )(zfs_force|zfs\.force|zfsforce)=(on|yes|1)( |$)' /proc/cmdline then ZPOOL_FORCE="-f" fi # ------------ # Look for 'rpool' and 'bootfs' parameter [ -n "$rpool" ] && ZFS_RPOOL="${rpool#rpool=}" [ -n "$bootfs" ] && ZFS_BOOTFS="${bootfs#bootfs=}" # ------------ # If we have 'ROOT' (see above), but not 'ZFS_BOOTFS', then use # 'ROOT' [ -n "$ROOT" ] && [ -z "${ZFS_BOOTFS}" ] && ZFS_BOOTFS="$ROOT" # ------------ # Check for the `-B zfs-bootfs=%s/%u,...` kind of parameter. # NOTE: Only use the pool name and dataset. The rest is not # supported by OpenZFS (whatever it's for). if [ -z "$ZFS_RPOOL" ] then # The ${zfs-bootfs} variable is set at the kernel command # line, usually by GRUB, but it cannot be referenced here # directly because bourne variable names cannot contain a # hyphen. # # Reassign the variable by dumping the environment and # stripping the zfs-bootfs= prefix. Let the shell handle # quoting through the eval command: # shellcheck disable=SC2046 eval ZFS_RPOOL=$(set | sed -n -e 's,^zfs-bootfs=,,p') fi # ------------ # No root fs or pool specified - do auto detect. if [ -z "$ZFS_RPOOL" ] && [ -z "${ZFS_BOOTFS}" ] then # Do auto detect. Do this by 'cheating' - set 'root=zfs:AUTO' # which will be caught later ROOT='zfs:AUTO' fi # ---------------------------------------------------------------- # F I N D A N D I M P O R T C O R R E C T P O O L # ------------ if [ "$ROOT" = "zfs:AUTO" ] then # Try to detect both pool and root fs. # If we got here, that means we don't have a hint so as to # the root dataset, but with root=zfs:AUTO on cmdline, # this says "zfs:AUTO" here and interferes with checks later ZFS_BOOTFS= [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Attempting to import additional pools." # Get a list of pools available for import if [ -n "$ZFS_RPOOL" ] then # We've specified a pool - check only that POOLS=$ZFS_RPOOL else POOLS=$(get_pools) fi OLD_IFS="$IFS" ; IFS=";" for pool in $POOLS do [ -z "$pool" ] && continue IFS="$OLD_IFS" import_pool "$pool" IFS="$OLD_IFS" find_rootfs "$pool" && break done IFS="$OLD_IFS" [ "$quiet" != "y" ] && zfs_log_end_msg $ZFS_ERROR else # No auto - use value from the command line option. # Strip 'zfs:' and 'ZFS='. ZFS_BOOTFS="${ROOT#*[:=]}" # Strip everything after the first slash. ZFS_RPOOL="${ZFS_BOOTFS%%/*}" fi # Import the pool (if not already done so in the AUTO check above). if [ -n "$ZFS_RPOOL" ] && [ -z "${POOL_IMPORTED}" ] then [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Importing ZFS root pool '$ZFS_RPOOL'" import_pool "${ZFS_RPOOL}" find_rootfs "${ZFS_RPOOL}" [ "$quiet" != "y" ] && zfs_log_end_msg fi if [ -z "${POOL_IMPORTED}" ] then # No pool imported, this is serious! disable_plymouth echo "" echo "Command: $ZFS_CMD" echo "Message: $ZFS_STDERR" echo "Error: $ZFS_ERROR" echo "" echo "No pool imported. Manually import the root pool" echo "at the command prompt and then exit." echo "Hint: Try: zpool import -N ${ZFS_RPOOL}" shell fi # In case the pool was specified as guid, resolve guid to name pool="$("${ZPOOL}" get name,guid -o name,value -H | \ awk -v pool="${ZFS_RPOOL}" '$2 == pool { print $1 }')" if [ -n "$pool" ]; then # If $ZFS_BOOTFS contains guid, replace the guid portion with $pool ZFS_BOOTFS=$(echo "$ZFS_BOOTFS" | \ sed -e "s/$("${ZPOOL}" get guid -o value "$pool" -H)/$pool/g") ZFS_RPOOL="${pool}" fi # ---------------------------------------------------------------- # P R E P A R E R O O T F I L E S Y S T E M if [ -n "${ZFS_BOOTFS}" ] then # Booting from a snapshot? # Will overwrite the ZFS_BOOTFS variable like so: # rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2 echo "${ZFS_BOOTFS}" | grep -q '@' && \ setup_snapshot_booting "${ZFS_BOOTFS}" fi if [ -z "${ZFS_BOOTFS}" ] then # Still nothing! Let the user sort this out. disable_plymouth echo "" echo "Error: Unknown root filesystem - no 'bootfs' pool property and" echo " not specified on the kernel command line." echo "" echo "Manually mount the root filesystem on $rootmnt and then exit." echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" shell fi # ---------------------------------------------------------------- # M O U N T F I L E S Y S T E M S # * Ideally, the root filesystem would be mounted like this: # # zpool import -R "$rootmnt" -N "$ZFS_RPOOL" # zfs mount -o mountpoint=/ "${ZFS_BOOTFS}" # # but the MOUNTPOINT prefix is preserved on descendent filesystem # after the pivot into the regular root, which later breaks things # like `zfs mount -a` and the /proc/self/mounts refresh. # # * Mount additional filesystems required # Such as /usr, /var, /usr/local etc. # NOTE: Mounted in the order specified in the # ZFS_INITRD_ADDITIONAL_DATASETS variable so take care! # Go through the complete list (recursively) of all filesystems below # the real root dataset filesystems="$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}")" OLD_IFS="$IFS" ; IFS=" " for fs in $filesystems; do IFS="$OLD_IFS" mount_fs "$fs" done IFS="$OLD_IFS" for fs in $ZFS_INITRD_ADDITIONAL_DATASETS; do mount_fs "$fs" done touch /run/zfs_unlock_complete if [ -e /run/zfs_unlock_complete_notify ]; then read -r < /run/zfs_unlock_complete_notify fi # ------------ # Debugging information if [ -n "${ZFS_DEBUG}" ] then #exec 2>&1- echo "DEBUG: imported pools:" "${ZPOOL}" list -H echo echo "DEBUG: mounted ZFS filesystems:" mount | grep zfs echo echo "=> waiting for ENTER before continuing because of 'zfsdebug=1'. " printf "%s" " 'c' for shell, 'r' for reboot, 'ENTER' to continue. " read -r b [ "$b" = "c" ] && /bin/sh [ "$b" = "r" ] && reboot -f set +x fi # ------------ # Run local bottom script if command -v run_scripts > /dev/null 2>&1 then if [ -f "/scripts/local-bottom" ] || [ -d "/scripts/local-bottom" ] then [ "$quiet" != "y" ] && \ zfs_log_begin_msg "Running /scripts/local-bottom" run_scripts /scripts/local-bottom [ "$quiet" != "y" ] && zfs_log_end_msg fi fi } diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index 270f810027bb..c0883a983678 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -1,969 +1,970 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley */ #ifndef _LIBZFS_H #define _LIBZFS_H extern __attribute__((visibility("default"))) #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Miscellaneous ZFS constants */ #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN /* * libzfs errors */ typedef enum zfs_error { EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ EZFS_PROPTYPE, /* property does not apply to dataset type */ EZFS_PROPNONINHERIT, /* property is not inheritable */ EZFS_PROPSPACE, /* bad quota or reservation */ EZFS_BADTYPE, /* dataset is not of appropriate type */ EZFS_BUSY, /* pool or dataset is busy */ EZFS_EXISTS, /* pool or dataset already exists */ EZFS_NOENT, /* no such pool or dataset */ EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ EZFS_BADTARGET, /* bad attach/detach/replace target */ EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ EZFS_BADPATH, /* must be an absolute path */ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* failed to unshare over nfs */ EZFS_SHARENFSFAILED, /* failed to share over nfs */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ EZFS_NAMETOOLONG, /* dataset name is too long */ EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ EZFS_BADWHO, /* invalid permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ EZFS_NO_SCRUB, /* no active scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ EZFS_TOOMANY, /* argument list too long */ EZFS_INITIALIZING, /* currently initializing */ EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */ EZFS_TRIMMING, /* currently trimming */ EZFS_NO_TRIM, /* no active trim */ EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_UNKNOWN } zfs_error_t; /* * The following data structures are all part * of the zfs_allow_t data structure which is * used for printing 'allow' permissions. * It is a linked list of zfs_allow_t's which * then contain avl tree's for user/group/sets/... * and each one of the entries in those trees have * avl tree's for the permissions they belong to and * whether they are local,descendent or local+descendent * permissions. The AVL trees are used primarily for * sorting purposes, but also so that we can quickly find * a given user and or permission. */ typedef struct zfs_perm_node { avl_node_t z_node; char z_pname[MAXPATHLEN]; } zfs_perm_node_t; typedef struct zfs_allow_node { avl_node_t z_node; char z_key[MAXPATHLEN]; /* name, such as joe */ avl_tree_t z_localdescend; /* local+descendent perms */ avl_tree_t z_local; /* local permissions */ avl_tree_t z_descend; /* descendent permissions */ } zfs_allow_node_t; typedef struct zfs_allow { struct zfs_allow *z_next; char z_setpoint[MAXPATHLEN]; avl_tree_t z_sets; avl_tree_t z_crperms; avl_tree_t z_user; avl_tree_t z_group; avl_tree_t z_everyone; } zfs_allow_t; /* * Basic handle types */ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; _LIBZFS_H int zpool_wait(zpool_handle_t *, zpool_wait_activity_t); _LIBZFS_H int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t, boolean_t *, boolean_t *); /* * Library initialization */ _LIBZFS_H libzfs_handle_t *libzfs_init(void); _LIBZFS_H void libzfs_fini(libzfs_handle_t *); _LIBZFS_H libzfs_handle_t *zpool_get_handle(zpool_handle_t *); _LIBZFS_H libzfs_handle_t *zfs_get_handle(zfs_handle_t *); _LIBZFS_H void libzfs_print_on_error(libzfs_handle_t *, boolean_t); _LIBZFS_H void zfs_save_arguments(int argc, char **, char *, int); _LIBZFS_H int zpool_log_history(libzfs_handle_t *, const char *); _LIBZFS_H int libzfs_errno(libzfs_handle_t *); _LIBZFS_H const char *libzfs_error_init(int); _LIBZFS_H const char *libzfs_error_action(libzfs_handle_t *); _LIBZFS_H const char *libzfs_error_description(libzfs_handle_t *); _LIBZFS_H int zfs_standard_error(libzfs_handle_t *, int, const char *); _LIBZFS_H void libzfs_mnttab_init(libzfs_handle_t *); _LIBZFS_H void libzfs_mnttab_fini(libzfs_handle_t *); _LIBZFS_H void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); _LIBZFS_H int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); _LIBZFS_H void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); _LIBZFS_H void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions */ _LIBZFS_H zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); _LIBZFS_H zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); _LIBZFS_H void zpool_close(zpool_handle_t *); _LIBZFS_H const char *zpool_get_name(zpool_handle_t *); _LIBZFS_H int zpool_get_state(zpool_handle_t *); _LIBZFS_H const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); _LIBZFS_H const char *zpool_pool_state_to_name(pool_state_t); _LIBZFS_H void zpool_free_handles(libzfs_handle_t *); /* * Iterate over all active pools in the system. */ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); _LIBZFS_H int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); _LIBZFS_H boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { /* do not split, but return the config that would be split off */ int dryrun : 1; /* after splitting, import the pool */ int import : 1; int name_flags; } splitflags_t; typedef struct trimflags { /* requested vdevs are for the entire pool */ boolean_t fullpool; /* request a secure trim, requires support from device */ boolean_t secure; /* after starting trim, block until trim completes */ boolean_t wait; /* trim at the requested rate in bytes/second */ uint64_t rate; } trimflags_t; /* * Functions to manipulate pool and vdev state */ _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); _LIBZFS_H int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int, boolean_t); _LIBZFS_H int zpool_vdev_detach(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_remove(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_remove_cancel(zpool_handle_t *); _LIBZFS_H int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); _LIBZFS_H int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); _LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_clear(zpool_handle_t *, uint64_t); _LIBZFS_H nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); _LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); _LIBZFS_H const char *zpool_get_state_str(zpool_handle_t *); /* * Functions to manage pool properties */ _LIBZFS_H int zpool_set_prop(zpool_handle_t *, const char *, const char *); _LIBZFS_H int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t literal); _LIBZFS_H uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); _LIBZFS_H int zpool_props_refresh(zpool_handle_t *); _LIBZFS_H const char *zpool_prop_to_name(zpool_prop_t); _LIBZFS_H const char *zpool_prop_values(zpool_prop_t); /* * Pool health statistics. */ typedef enum { /* * The following correspond to faults as defined in the (fault.fs.zfs.*) * event namespace. Each is associated with a corresponding message ID. * This must be kept in sync with the zfs_msgid_table in * lib/libzfs/libzfs_status.c. */ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */ ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ ZPOOL_STATUS_ERRATA, /* informational errata available */ /* * If the pool has unsupported features but can still be opened in * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the * pool has unsupported features but cannot be opened at all, its * status is ZPOOL_STATUS_UNSUP_FEAT_READ. */ ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly * requiring administrative attention. There is no corresponding * message ID. */ ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ /* * Finally, the following indicates a healthy pool. */ ZPOOL_STATUS_OK } zpool_status_t; _LIBZFS_H zpool_status_t zpool_get_status(zpool_handle_t *, char **, zpool_errata_t *); _LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, char **, zpool_errata_t *); /* * Statistics and configuration functions. */ _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); _LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ _LIBZFS_H int zpool_export(zpool_handle_t *, boolean_t, const char *); _LIBZFS_H int zpool_export_force(zpool_handle_t *, const char *); _LIBZFS_H int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); _LIBZFS_H int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); _LIBZFS_H void zpool_print_unsup_feat(nvlist_t *config); /* * Miscellaneous pool functions */ struct zfs_cmd; _LIBZFS_H const char *zfs_history_event_names[]; typedef enum { VDEV_NAME_PATH = 1 << 0, VDEV_NAME_GUID = 1 << 1, VDEV_NAME_FOLLOW_LINKS = 1 << 2, VDEV_NAME_TYPE_ID = 1 << 3, } vdev_name_t; _LIBZFS_H char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, int name_flags); _LIBZFS_H int zpool_upgrade(zpool_handle_t *, uint64_t); _LIBZFS_H int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, boolean_t *); _LIBZFS_H int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, int); _LIBZFS_H int zpool_events_clear(libzfs_handle_t *, int *); _LIBZFS_H int zpool_events_seek(libzfs_handle_t *, uint64_t, int); _LIBZFS_H void zpool_obj_to_path_ds(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); _LIBZFS_H void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); _LIBZFS_H int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); _LIBZFS_H int zpool_get_physpath(zpool_handle_t *, char *, size_t); _LIBZFS_H void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); _LIBZFS_H int zpool_checkpoint(zpool_handle_t *); _LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *); _LIBZFS_H boolean_t zpool_is_draid_spare(const char *); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ _LIBZFS_H zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); _LIBZFS_H zfs_handle_t *zfs_handle_dup(zfs_handle_t *); _LIBZFS_H void zfs_close(zfs_handle_t *); _LIBZFS_H zfs_type_t zfs_get_type(const zfs_handle_t *); _LIBZFS_H zfs_type_t zfs_get_underlying_type(const zfs_handle_t *); _LIBZFS_H const char *zfs_get_name(const zfs_handle_t *); _LIBZFS_H zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); _LIBZFS_H const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. Some functions are shared with the kernel, * and are found in sys/fs/zfs.h. */ /* * zfs dataset property management */ _LIBZFS_H const char *zfs_prop_default_string(zfs_prop_t); _LIBZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _LIBZFS_H const char *zfs_prop_column_name(zfs_prop_t); _LIBZFS_H boolean_t zfs_prop_align_right(zfs_prop_t); _LIBZFS_H nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, const char *); _LIBZFS_H const char *zfs_prop_to_name(zfs_prop_t); _LIBZFS_H int zfs_prop_set(zfs_handle_t *, const char *, const char *); _LIBZFS_H int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); _LIBZFS_H int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); _LIBZFS_H int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); _LIBZFS_H int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); _LIBZFS_H int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); _LIBZFS_H int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); _LIBZFS_H int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); _LIBZFS_H uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); _LIBZFS_H uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); _LIBZFS_H int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); _LIBZFS_H const char *zfs_prop_values(zfs_prop_t); _LIBZFS_H int zfs_prop_is_string(zfs_prop_t prop); _LIBZFS_H nvlist_t *zfs_get_all_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_user_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_recvd_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); _LIBZFS_H int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, boolean_t *, boolean_t *); /* * zfs encryption management */ _LIBZFS_H int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, char *); _LIBZFS_H int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, nvlist_t *, boolean_t stdin_available, uint8_t **, uint_t *); _LIBZFS_H int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, nvlist_t *); _LIBZFS_H int zfs_crypto_attempt_load_keys(libzfs_handle_t *, char *); _LIBZFS_H int zfs_crypto_load_key(zfs_handle_t *, boolean_t, char *); _LIBZFS_H int zfs_crypto_unload_key(zfs_handle_t *); _LIBZFS_H int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); typedef struct zprop_list { int pl_prop; char *pl_user_prop; struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; _LIBZFS_H int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); _LIBZFS_H void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" #define ZFS_FEATURE_DISABLED "disabled" #define ZFS_FEATURE_ENABLED "enabled" #define ZFS_FEATURE_ACTIVE "active" #define ZFS_UNSUPPORTED_INACTIVE "inactive" #define ZFS_UNSUPPORTED_READONLY "readonly" /* * zpool property management */ _LIBZFS_H int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **, boolean_t); _LIBZFS_H int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); _LIBZFS_H const char *zpool_prop_default_string(zpool_prop_t); _LIBZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _LIBZFS_H const char *zpool_prop_column_name(zpool_prop_t); _LIBZFS_H boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. */ _LIBZFS_H int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); _LIBZFS_H int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); _LIBZFS_H void zprop_free_list(zprop_list_t *); #define ZFS_GET_NCOLS 5 typedef enum { GET_COL_NONE, GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE } zfs_get_column_t; /* * Functions for printing zfs or zpool properties */ typedef struct zprop_get_cbdata { int cb_sources; zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; zprop_list_t *cb_proplist; zfs_type_t cb_type; } zprop_get_cbdata_t; _LIBZFS_H void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); /* * Iterator functions. */ typedef int (*zfs_iter_f)(zfs_handle_t *, void *); _LIBZFS_H int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; size_t cb_alloc; size_t cb_used; } get_all_cb_t; _LIBZFS_H void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, zfs_iter_f, void *, boolean_t); _LIBZFS_H void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* * Functions to create and destroy datasets. */ _LIBZFS_H int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); _LIBZFS_H int zfs_create_ancestors(libzfs_handle_t *, const char *); _LIBZFS_H int zfs_destroy(zfs_handle_t *, boolean_t); _LIBZFS_H int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); _LIBZFS_H int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); +_LIBZFS_H int zfs_destroy_snaps_nvl_os(libzfs_handle_t *, nvlist_t *); _LIBZFS_H int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); _LIBZFS_H int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); _LIBZFS_H int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); typedef struct renameflags { /* recursive rename */ int recursive : 1; /* don't unmount file systems */ int nounmount : 1; /* force unmount file systems */ int forceunmount : 1; } renameflags_t; _LIBZFS_H int zfs_rename(zfs_handle_t *, const char *, renameflags_t); typedef struct sendflags { /* Amount of extra information to print. */ int verbosity; /* recursive send (ie, -R) */ boolean_t replicate; /* for recursive send, skip sending missing snapshots */ boolean_t skipmissing; /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; /* field no longer used, maintained for backwards compatibility */ boolean_t pad; /* send properties (ie, -p) */ boolean_t props; /* do not send (no-op, ie. -n) */ boolean_t dryrun; /* parsable verbose output (ie. -P) */ boolean_t parsable; /* show progress (ie. -v) */ boolean_t progress; /* large blocks (>128K) are permitted */ boolean_t largeblock; /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; /* compressed WRITE records are permitted */ boolean_t compress; /* raw encrypted records are permitted */ boolean_t raw; /* only send received properties (ie. -b) */ boolean_t backup; /* include snapshot holds in send stream */ boolean_t holds; /* stream represents a partially received dataset */ boolean_t saved; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); _LIBZFS_H int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); _LIBZFS_H int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, const char *); _LIBZFS_H int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); _LIBZFS_H int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, const char *); _LIBZFS_H int zfs_send_saved(zfs_handle_t *, sendflags_t *, int, const char *); _LIBZFS_H nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token); _LIBZFS_H int zfs_promote(zfs_handle_t *); _LIBZFS_H int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); _LIBZFS_H int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); _LIBZFS_H int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); _LIBZFS_H int zfs_get_holds(zfs_handle_t *, nvlist_t **); _LIBZFS_H uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); _LIBZFS_H int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); _LIBZFS_H int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); _LIBZFS_H int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ boolean_t force; /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; /* * Mark the file systems as "resumable" and do not destroy them if the * receive is interrupted */ boolean_t resumable; /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ boolean_t nomount; /* Was holds flag set in the compound header? */ boolean_t holds; /* skip receive of snapshot holds */ boolean_t skipholds; /* mount the filesystem unless nomount is specified */ boolean_t domount; /* force unmount while recv snapshot (private) */ boolean_t forceunmount; } recvflags_t; _LIBZFS_H int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 0x1, ZFS_DIFF_TIMESTAMP = 0x2, ZFS_DIFF_CLASSIFY = 0x4 } diff_flags_t; _LIBZFS_H int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. */ _LIBZFS_H const char *zfs_type_to_name(zfs_type_t); _LIBZFS_H void zfs_refresh_properties(zfs_handle_t *); _LIBZFS_H int zfs_name_valid(const char *, zfs_type_t); _LIBZFS_H zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, const char *, zfs_type_t); _LIBZFS_H int zfs_parent_name(zfs_handle_t *, char *, size_t); _LIBZFS_H boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); _LIBZFS_H int zfs_spa_version(zfs_handle_t *, int *); _LIBZFS_H boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. */ _LIBZFS_H boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); _LIBZFS_H boolean_t zfs_is_mounted(zfs_handle_t *, char **); _LIBZFS_H int zfs_mount(zfs_handle_t *, const char *, int); _LIBZFS_H int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); _LIBZFS_H int zfs_unmount(zfs_handle_t *, const char *, int); _LIBZFS_H int zfs_unmountall(zfs_handle_t *, int); _LIBZFS_H int zfs_mount_delegation_check(void); #if defined(__linux__) || defined(__APPLE__) _LIBZFS_H int zfs_parse_mount_options(char *mntopts, unsigned long *mntflags, unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt); _LIBZFS_H void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, char *mntopts, char *mtabopt); #endif /* * Share support functions. */ _LIBZFS_H boolean_t zfs_is_shared(zfs_handle_t *); _LIBZFS_H int zfs_share(zfs_handle_t *); _LIBZFS_H int zfs_unshare(zfs_handle_t *); /* * Protocol-specific share support functions. */ _LIBZFS_H boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); _LIBZFS_H boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); _LIBZFS_H int zfs_share_nfs(zfs_handle_t *); _LIBZFS_H int zfs_share_smb(zfs_handle_t *); _LIBZFS_H int zfs_shareall(zfs_handle_t *); _LIBZFS_H int zfs_unshare_nfs(zfs_handle_t *, const char *); _LIBZFS_H int zfs_unshare_smb(zfs_handle_t *, const char *); _LIBZFS_H int zfs_unshareall_nfs(zfs_handle_t *); _LIBZFS_H int zfs_unshareall_smb(zfs_handle_t *); _LIBZFS_H int zfs_unshareall_bypath(zfs_handle_t *, const char *); _LIBZFS_H int zfs_unshareall_bytype(zfs_handle_t *, const char *, const char *); _LIBZFS_H int zfs_unshareall(zfs_handle_t *); _LIBZFS_H int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); _LIBZFS_H void zfs_commit_nfs_shares(void); _LIBZFS_H void zfs_commit_smb_shares(void); _LIBZFS_H void zfs_commit_all_shares(void); _LIBZFS_H void zfs_commit_shares(const char *); _LIBZFS_H int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Utility functions to run an external process. */ #define STDOUT_VERBOSE 0x01 #define STDERR_VERBOSE 0x02 #define NO_DEFAULT_PATH 0x04 /* Don't use $PATH to lookup the command */ _LIBZFS_H int libzfs_run_process(const char *, char **, int); _LIBZFS_H int libzfs_run_process_get_stdout(const char *, char *[], char *[], char **[], int *); _LIBZFS_H int libzfs_run_process_get_stdout_nopath(const char *, char *[], char *[], char **[], int *); _LIBZFS_H void libzfs_free_str_array(char **, int); _LIBZFS_H int libzfs_envvar_is_set(char *); /* * Utility functions for zfs version */ _LIBZFS_H void zfs_version_userland(char *, int); _LIBZFS_H int zfs_version_kernel(char *, int); _LIBZFS_H int zfs_version_print(void); /* * Given a device or file, determine if it is part of a pool. */ _LIBZFS_H int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. */ _LIBZFS_H int zpool_clear_label(int); _LIBZFS_H int zpool_set_bootenv(zpool_handle_t *, const nvlist_t *); _LIBZFS_H int zpool_get_bootenv(zpool_handle_t *, nvlist_t **); /* * Management interfaces for SMB ACL files */ _LIBZFS_H int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); _LIBZFS_H int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); _LIBZFS_H int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); _LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ _LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int); _LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_volume_os(const char *); /* * Parse a features file for -o compatibility */ typedef enum { ZPOOL_COMPATIBILITY_OK, ZPOOL_COMPATIBILITY_WARNTOKEN, ZPOOL_COMPATIBILITY_BADTOKEN, ZPOOL_COMPATIBILITY_BADFILE, ZPOOL_COMPATIBILITY_NOFILES } zpool_compat_status_t; _LIBZFS_H zpool_compat_status_t zpool_load_compat(const char *, boolean_t *, char *, size_t); #ifdef __FreeBSD__ /* * Attach/detach the given filesystem to/from the given jail. */ _LIBZFS_H int zfs_jail(zfs_handle_t *zhp, int jailid, int attach); /* * Set loader options for next boot. */ _LIBZFS_H int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); #endif /* __FreeBSD__ */ #ifdef __cplusplus } #endif #endif /* _LIBZFS_H */ diff --git a/sys/contrib/openzfs/include/libzfs_core.h b/sys/contrib/openzfs/include/libzfs_core.h index 83d8211ab615..9020d70db397 100644 --- a/sys/contrib/openzfs/include/libzfs_core.h +++ b/sys/contrib/openzfs/include/libzfs_core.h @@ -1,150 +1,153 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. */ #ifndef _LIBZFS_CORE_H #define _LIBZFS_CORE_H extern __attribute__((visibility("default"))) #include #include #include #include #ifdef __cplusplus extern "C" { #endif _LIBZFS_CORE_H int libzfs_core_init(void); _LIBZFS_CORE_H void libzfs_core_fini(void); +struct zfs_cmd; +_LIBZFS_CORE_H int lzc_ioctl_fd(int, unsigned long, struct zfs_cmd *); + /* * NB: this type should be kept binary-compatible with dmu_objset_type_t. */ enum lzc_dataset_type { LZC_DATSET_TYPE_ZFS = 2, LZC_DATSET_TYPE_ZVOL }; _LIBZFS_CORE_H int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *, uint8_t *, uint_t); _LIBZFS_CORE_H int lzc_clone(const char *, const char *, nvlist_t *); _LIBZFS_CORE_H int lzc_promote(const char *, char *, int); _LIBZFS_CORE_H int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); _LIBZFS_CORE_H int lzc_bookmark(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_bookmark_props(const char *, nvlist_t **); _LIBZFS_CORE_H int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); _LIBZFS_CORE_H int lzc_unload_key(const char *); _LIBZFS_CORE_H int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); _LIBZFS_CORE_H int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_redact(const char *, const char *, nvlist_t *); _LIBZFS_CORE_H int lzc_snaprange_space(const char *, const char *, uint64_t *); _LIBZFS_CORE_H int lzc_hold(nvlist_t *, int, nvlist_t **); _LIBZFS_CORE_H int lzc_release(nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_holds(const char *, nvlist_t **); enum lzc_send_flags { LZC_SEND_FLAG_EMBED_DATA = 1 << 0, LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, LZC_SEND_FLAG_COMPRESS = 1 << 2, LZC_SEND_FLAG_RAW = 1 << 3, LZC_SEND_FLAG_SAVED = 1 << 4, }; _LIBZFS_CORE_H int lzc_send(const char *, const char *, int, enum lzc_send_flags); _LIBZFS_CORE_H int lzc_send_resume(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t); _LIBZFS_CORE_H int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); struct dmu_replay_record; _LIBZFS_CORE_H int lzc_send_redacted(const char *, const char *, int, enum lzc_send_flags, const char *); _LIBZFS_CORE_H int lzc_send_resume_redacted(const char *, const char *, int, enum lzc_send_flags, uint64_t, uint64_t, const char *); _LIBZFS_CORE_H int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, boolean_t, int); _LIBZFS_CORE_H int lzc_receive_resumable(const char *, nvlist_t *, const char *, boolean_t, boolean_t, int); _LIBZFS_CORE_H int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *); _LIBZFS_CORE_H int lzc_receive_one(const char *, nvlist_t *, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); _LIBZFS_CORE_H int lzc_send_space_resume_redacted(const char *, const char *, enum lzc_send_flags, uint64_t, uint64_t, uint64_t, const char *, int, uint64_t *); _LIBZFS_CORE_H uint64_t lzc_send_progress(int); _LIBZFS_CORE_H boolean_t lzc_exists(const char *); _LIBZFS_CORE_H int lzc_rollback(const char *, char *, int); _LIBZFS_CORE_H int lzc_rollback_to(const char *, const char *); _LIBZFS_CORE_H int lzc_rename(const char *, const char *); _LIBZFS_CORE_H int lzc_destroy(const char *); _LIBZFS_CORE_H int lzc_channel_program(const char *, const char *, uint64_t, uint64_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_channel_program_nosync(const char *, const char *, uint64_t, uint64_t, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_sync(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_reopen(const char *, boolean_t); _LIBZFS_CORE_H int lzc_pool_checkpoint(const char *); _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *); _LIBZFS_CORE_H int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); _LIBZFS_CORE_H int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_set_bootenv(const char *, const nvlist_t *); _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **); #ifdef __cplusplus } #endif #endif /* _LIBZFS_CORE_H */ diff --git a/sys/contrib/openzfs/include/libzutil.h b/sys/contrib/openzfs/include/libzutil.h index ef17bd5426df..c0a660ea7067 100644 --- a/sys/contrib/openzfs/include/libzutil.h +++ b/sys/contrib/openzfs/include/libzutil.h @@ -1,170 +1,179 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. */ #ifndef _LIBZUTIL_H #define _LIBZUTIL_H extern __attribute__((visibility("default"))) #include #include #ifdef __cplusplus extern "C" { #endif /* * Default wait time for a device name to be created. */ #define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ /* * Pool Config Operations * * These are specific to the library libzfs or libzpool instance. */ typedef nvlist_t *refresh_config_func_t(void *, nvlist_t *); typedef int pool_active_func_t(void *, const char *, uint64_t, boolean_t *); typedef const struct pool_config_ops { refresh_config_func_t *pco_refresh_config; pool_active_func_t *pco_pool_active; } pool_config_ops_t; /* * An instance of pool_config_ops_t is expected in the caller's binary. */ _LIBZUTIL_H const pool_config_ops_t libzfs_config_ops; _LIBZUTIL_H const pool_config_ops_t libzpool_config_ops; typedef struct importargs { char **path; /* a list of paths to search */ int paths; /* number of paths to search */ const char *poolname; /* name of a pool to find */ uint64_t guid; /* guid of a pool to find */ const char *cachefile; /* cachefile to use for import */ boolean_t can_be_active; /* can the pool be active? */ boolean_t scan; /* prefer scanning to libblkid cache */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) */ } importargs_t; _LIBZUTIL_H nvlist_t *zpool_search_import(void *, importargs_t *, const pool_config_ops_t *); _LIBZUTIL_H int zpool_find_config(void *, const char *, nvlist_t **, importargs_t *, const pool_config_ops_t *); _LIBZUTIL_H const char * const * zpool_default_search_paths(size_t *count); _LIBZUTIL_H int zpool_read_label(int, nvlist_t **, int *); _LIBZUTIL_H int zpool_label_disk_wait(const char *, int); struct udev_device; _LIBZUTIL_H int zfs_device_get_devid(struct udev_device *, char *, size_t); _LIBZUTIL_H int zfs_device_get_physical(struct udev_device *, char *, size_t); _LIBZUTIL_H void update_vdev_config_dev_strs(nvlist_t *); /* * Default device paths */ #define DISK_ROOT "/dev" #define UDISK_ROOT "/dev/disk" #define ZVOL_ROOT "/dev/zvol" _LIBZUTIL_H int zfs_append_partition(char *path, size_t max_len); _LIBZUTIL_H int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); _LIBZUTIL_H char *zfs_strip_partition(char *); _LIBZUTIL_H char *zfs_strip_path(char *); _LIBZUTIL_H int zfs_strcmp_pathname(const char *, const char *, int); _LIBZUTIL_H boolean_t zfs_dev_is_dm(const char *); _LIBZUTIL_H boolean_t zfs_dev_is_whole_disk(const char *); _LIBZUTIL_H int zfs_dev_flush(int); _LIBZUTIL_H char *zfs_get_underlying_path(const char *); _LIBZUTIL_H char *zfs_get_enclosure_sysfs_path(const char *); _LIBZUTIL_H boolean_t is_mpath_whole_disk(const char *); _LIBZUTIL_H boolean_t zfs_isnumber(const char *); /* * Formats for iostat numbers. Examples: "12K", "30ms", "4B", "2321234", "-". * * ZFS_NICENUM_1024: Print kilo, mega, tera, peta, exa.. * ZFS_NICENUM_BYTES: Print single bytes ("13B"), kilo, mega, tera... * ZFS_NICENUM_TIME: Print nanosecs, microsecs, millisecs, seconds... * ZFS_NICENUM_RAW: Print the raw number without any formatting * ZFS_NICENUM_RAWTIME: Same as RAW, but print dashes ('-') for zero. */ enum zfs_nicenum_format { ZFS_NICENUM_1024 = 0, ZFS_NICENUM_BYTES = 1, ZFS_NICENUM_TIME = 2, ZFS_NICENUM_RAW = 3, ZFS_NICENUM_RAWTIME = 4 }; /* * Convert a number to a human-readable form. */ _LIBZUTIL_H void zfs_nicebytes(uint64_t, char *, size_t); _LIBZUTIL_H void zfs_nicenum(uint64_t, char *, size_t); _LIBZUTIL_H void zfs_nicenum_format(uint64_t, char *, size_t, enum zfs_nicenum_format); _LIBZUTIL_H void zfs_nicetime(uint64_t, char *, size_t); _LIBZUTIL_H void zfs_niceraw(uint64_t, char *, size_t); #define nicenum(num, buf, size) zfs_nicenum(num, buf, size) _LIBZUTIL_H void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); _LIBZUTIL_H int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); struct zfs_cmd; -_LIBZUTIL_H int zfs_ioctl_fd(int fd, unsigned long request, struct zfs_cmd *zc); /* * List of colors to use */ #define ANSI_RED "\033[0;31m" #define ANSI_YELLOW "\033[0;33m" #define ANSI_RESET "\033[0m" #define ANSI_BOLD "\033[1m" _LIBZUTIL_H void color_start(char *color); _LIBZUTIL_H void color_end(void); _LIBZUTIL_H int printf_color(char *color, char *format, ...); _LIBZUTIL_H const char *zfs_basename(const char *path); _LIBZUTIL_H ssize_t zfs_dirnamelen(const char *path); +/* + * These functions are used by the ZFS libraries and cmd/zpool code, but are + * not exported in the ABI. + */ +typedef int (*pool_vdev_iter_f)(void *, nvlist_t *, void *); +int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data); +int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, + void *data); +void update_vdevs_config_dev_sysfs_path(nvlist_t *config); #ifdef __cplusplus } #endif #endif /* _LIBZUTIL_H */ diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h index 91bc48effe3f..d36a6d2ce7e2 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h @@ -1,160 +1,159 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2014 Xin Li . All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_IOCTL_COMPAT_H #define _SYS_ZFS_IOCTL_COMPAT_H #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * Backwards ioctl compatibility */ /* ioctl versions for vfs.zfs.version.ioctl */ #define ZFS_IOCVER_UNDEF -1 #define ZFS_IOCVER_NONE 0 #define ZFS_IOCVER_DEADMAN 1 #define ZFS_IOCVER_LZC 2 #define ZFS_IOCVER_ZCMD 3 #define ZFS_IOCVER_EDBP 4 #define ZFS_IOCVER_RESUME 5 #define ZFS_IOCVER_INLANES 6 #define ZFS_IOCVER_PAD 7 #define ZFS_IOCVER_LEGACY ZFS_IOCVER_PAD #define ZFS_IOCVER_OZFS 15 /* compatibility conversion flag */ #define ZFS_CMD_COMPAT_NONE 0 #define ZFS_CMD_COMPAT_V15 1 #define ZFS_CMD_COMPAT_V28 2 #define ZFS_CMD_COMPAT_DEADMAN 3 #define ZFS_CMD_COMPAT_LZC 4 #define ZFS_CMD_COMPAT_ZCMD 5 #define ZFS_CMD_COMPAT_EDBP 6 #define ZFS_CMD_COMPAT_RESUME 7 #define ZFS_CMD_COMPAT_INLANES 8 #define ZFS_CMD_COMPAT_LEGACY 9 #define ZFS_IOC_COMPAT_PASS 254 #define ZFS_IOC_COMPAT_FAIL 255 #define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) typedef struct zfs_iocparm { uint32_t zfs_ioctl_version; uint64_t zfs_cmd; uint64_t zfs_cmd_size; } zfs_iocparm_t; #define LEGACY_MAXPATHLEN 1024 #define LEGACY_MAXNAMELEN 256 /* * Note: this struct must have the same layout in 32-bit and 64-bit, so * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit * kernel. Therefore, we add padding to it so that no "hidden" padding * is automatically added on 64-bit (but not on 32-bit). */ typedef struct zfs_cmd_legacy { char zc_name[LEGACY_MAXPATHLEN]; /* pool|dataset name */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[LEGACY_MAXPATHLEN * 2]; char zc_string[LEGACY_MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad3[3]; boolean_t zc_resumable; uint32_t zc_pad4; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_legacy_t; #ifdef _KERNEL int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, const int); nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, const int); #endif /* _KERNEL */ int zfs_ioctl_legacy_to_ozfs(int request); int zfs_ioctl_ozfs_to_legacy(int request); void zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst); -void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); void zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst); void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 10e29a45c89f..942ab9b10835 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -1,1074 +1,1076 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct dsl_crypto_params; struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_ENCRYPTED 0x20 #define DMU_OT_BYTESWAP_MASK 0x1f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata, encrypted) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) /* * MDB doesn't have dmu_ot; it defines these macros itself. */ #ifndef ZFS_MDB #define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) #define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) #define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_ENCRYPTED) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ DMU_OT_BYTESWAP_IMPL(ot)) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), } dmu_object_type_t; /* * These flags are intended to be used to specify the "txg_how" * parameter when calling the dmu_tx_assign() function. See the comment * above dmu_tx_assign() for more details on the meaning of these flags. */ #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_PROJECTUSED_OBJECT (-3ULL) /* * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT. */ #define DMU_OBJACCT_PREFIX "obj-" #define DMU_OBJACCT_PREFIX_LEN 4 /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t key_required, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, boolean_t key_required, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx); int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the number of levels on a dnode. nlevels must be greater than the * current number of levels or an EINVAL will be returned. */ int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx); /* * Set the data blocksize for an object. * * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly * to accommodate the change. When calling this function, the caller must * ensure that the object's nlevels can sufficiently support the new maxblkid. */ int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_will_dirty() * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp); int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, uint32_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You must not access the dmu_buf_t after releasing * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); +int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); uint64_t dmu_buf_user_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. * * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ /*ARGSUSED*/ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); dbu->dbu_evict_func_sync = evict_func_sync; dbu->dbu_evict_func_async = evict_func_async; taskq_init_ent(&dbu->dbu_tqent); #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. * * When multiple callbacks are registered to the transaction, the callbacks * will be called in reverse order to let Lustre, the only user of commit * callback currently, take the fast path of its commit callback handling. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf extern int zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; boolean_t ot_encrypt; char *ot_name; } dmu_object_type_info_t; typedef const struct dmu_object_byteswap_info { arc_byteswap_func_t ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef struct zfs_file_info { uint64_t zfi_user; uint64_t zfi_group; uint64_t zfi_project; uint64_t zfi_generation; } zfs_file_info_t; typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, zfs_file_t *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ diff --git a/sys/contrib/openzfs/include/sys/zfs_refcount.h b/sys/contrib/openzfs/include/sys/zfs_refcount.h index 1e6449472e38..2f59ebb32b07 100644 --- a/sys/contrib/openzfs/include/sys/zfs_refcount.h +++ b/sys/contrib/openzfs/include/sys/zfs_refcount.h @@ -1,126 +1,134 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_REFCOUNT_H #define _SYS_ZFS_REFCOUNT_H #include #include #include #ifdef __cplusplus extern "C" { #endif /* * If the reference is held only by the calling function and not any * particular object, use FTAG (which is a string) for the holder_tag. * Otherwise, use the object that holds the reference. */ #define FTAG ((char *)(uintptr_t)__func__) #ifdef ZFS_DEBUG typedef struct reference { list_node_t ref_link; const void *ref_holder; uint64_t ref_number; uint8_t *ref_removed; } reference_t; typedef struct refcount { kmutex_t rc_mtx; boolean_t rc_tracked; list_t rc_list; list_t rc_removed; uint64_t rc_count; uint64_t rc_removed_count; } zfs_refcount_t; /* * Note: zfs_refcount_t must be initialized with * refcount_create[_untracked]() */ void zfs_refcount_create(zfs_refcount_t *); void zfs_refcount_create_untracked(zfs_refcount_t *); void zfs_refcount_create_tracked(zfs_refcount_t *); void zfs_refcount_destroy(zfs_refcount_t *); void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t); int zfs_refcount_is_zero(zfs_refcount_t *); int64_t zfs_refcount_count(zfs_refcount_t *); int64_t zfs_refcount_add(zfs_refcount_t *, const void *); int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); +/* + * Note that (add|remove)_many add/remove one reference with "number" N, + * _not_ make N references with "number" 1, which is what vanilla + * zfs_refcount_(add|remove) would do if called N times. + * + * Attempting to remove a reference with number N when none exists is a + * panic on debug kernels with reference_tracking enabled. + */ int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); void zfs_refcount_transfer_ownership(zfs_refcount_t *, const void *, const void *); void zfs_refcount_transfer_ownership_many(zfs_refcount_t *, uint64_t, const void *, const void *); boolean_t zfs_refcount_held(zfs_refcount_t *, const void *); boolean_t zfs_refcount_not_held(zfs_refcount_t *, const void *); void zfs_refcount_init(void); void zfs_refcount_fini(void); #else /* ZFS_DEBUG */ typedef struct refcount { uint64_t rc_count; } zfs_refcount_t; #define zfs_refcount_create(rc) ((rc)->rc_count = 0) #define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0) #define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0) #define zfs_refcount_destroy(rc) ((rc)->rc_count = 0) #define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0) #define zfs_refcount_is_zero(rc) (zfs_refcount_count(rc) == 0) #define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) #define zfs_refcount_add_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, number) #define zfs_refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) #define zfs_refcount_transfer(dst, src) { \ uint64_t __tmp = zfs_refcount_count(src); \ atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } #define zfs_refcount_transfer_ownership(rc, ch, nh) ((void)0) #define zfs_refcount_transfer_ownership_many(rc, nr, ch, nh) ((void)0) #define zfs_refcount_held(rc, holder) (zfs_refcount_count(rc) > 0) #define zfs_refcount_not_held(rc, holder) (B_TRUE) #define zfs_refcount_init() #define zfs_refcount_fini() #endif /* ZFS_DEBUG */ #ifdef __cplusplus } #endif #endif /* _SYS_REFCOUNT_H */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 5b606eaf8d50..b3589e9b0312 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -1,692 +1,692 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019-2020, Michael Niewöhner */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, #if !defined(__FreeBSD__) ZIO_CHECKSUM_EDONR, #endif ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL -#define ZIO_CHECKSUM_VERIFY (1 << 8) +#define ZIO_CHECKSUM_VERIFY (1U << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 #define ZIO_DATA_SALT_LEN 8 #define ZIO_DATA_MAC_LEN 16 /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" selected by the compression features enabled * on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_GZIP_1 || \ (compress) == ZIO_COMPRESS_GZIP_2 || \ (compress) == ZIO_COMPRESS_GZIP_3 || \ (compress) == ZIO_COMPRESS_GZIP_4 || \ (compress) == ZIO_COMPRESS_GZIP_5 || \ (compress) == ZIO_COMPRESS_GZIP_6 || \ (compress) == ZIO_COMPRESS_GZIP_7 || \ (compress) == ZIO_COMPRESS_GZIP_8 || \ (compress) == ZIO_COMPRESS_GZIP_9 || \ (compress) == ZIO_COMPRESS_ZLE || \ (compress) == ZIO_COMPRESS_ZSTD || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) #define ZIO_COMPRESS_ALGO(x) (x & SPA_COMPRESSMASK) #define ZIO_COMPRESS_LEVEL(x) ((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS) #define ZIO_COMPRESS_RAW(type, level) (type | ((level) << SPA_COMPRESSBITS)) #define ZIO_COMPLEVEL_ZSTD(level) \ ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level) #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 typedef enum zio_suspend_reason { ZIO_SUSPEND_NONE = 0, ZIO_SUSPEND_IOERR, ZIO_SUSPEND_MMP, } zio_suspend_reason_t; enum zio_flag { /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ - ZIO_FLAG_DONT_AGGREGATE = 1 << 0, - ZIO_FLAG_IO_REPAIR = 1 << 1, - ZIO_FLAG_SELF_HEAL = 1 << 2, - ZIO_FLAG_RESILVER = 1 << 3, - ZIO_FLAG_SCRUB = 1 << 4, - ZIO_FLAG_SCAN_THREAD = 1 << 5, - ZIO_FLAG_PHYSICAL = 1 << 6, + ZIO_FLAG_DONT_AGGREGATE = 1U << 0, + ZIO_FLAG_IO_REPAIR = 1U << 1, + ZIO_FLAG_SELF_HEAL = 1U << 2, + ZIO_FLAG_RESILVER = 1U << 3, + ZIO_FLAG_SCRUB = 1U << 4, + ZIO_FLAG_SCAN_THREAD = 1U << 5, + ZIO_FLAG_PHYSICAL = 1U << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ - ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 8, - ZIO_FLAG_CONFIG_WRITER = 1 << 9, - ZIO_FLAG_DONT_RETRY = 1 << 10, - ZIO_FLAG_DONT_CACHE = 1 << 11, - ZIO_FLAG_NODATA = 1 << 12, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, - ZIO_FLAG_IO_ALLOCATING = 1 << 14, + ZIO_FLAG_CANFAIL = 1U << 7, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1U << 8, + ZIO_FLAG_CONFIG_WRITER = 1U << 9, + ZIO_FLAG_DONT_RETRY = 1U << 10, + ZIO_FLAG_DONT_CACHE = 1U << 11, + ZIO_FLAG_NODATA = 1U << 12, + ZIO_FLAG_INDUCE_DAMAGE = 1U << 13, + ZIO_FLAG_IO_ALLOCATING = 1U << 14, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 16, - ZIO_FLAG_TRYHARD = 1 << 17, - ZIO_FLAG_OPTIONAL = 1 << 18, + ZIO_FLAG_IO_RETRY = 1U << 15, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1U << 16, + ZIO_FLAG_TRYHARD = 1U << 17, + ZIO_FLAG_OPTIONAL = 1U << 18, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 20, - ZIO_FLAG_IO_BYPASS = 1 << 21, - ZIO_FLAG_IO_REWRITE = 1 << 22, - ZIO_FLAG_RAW_COMPRESS = 1 << 23, - ZIO_FLAG_RAW_ENCRYPT = 1 << 24, - ZIO_FLAG_GANG_CHILD = 1 << 25, - ZIO_FLAG_DDT_CHILD = 1 << 26, - ZIO_FLAG_GODFATHER = 1 << 27, - ZIO_FLAG_NOPWRITE = 1 << 28, - ZIO_FLAG_REEXECUTED = 1 << 29, - ZIO_FLAG_DELEGATED = 1 << 30, - ZIO_FLAG_FASTWRITE = 1 << 31, + ZIO_FLAG_DONT_QUEUE = 1U << 19, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1U << 20, + ZIO_FLAG_IO_BYPASS = 1U << 21, + ZIO_FLAG_IO_REWRITE = 1U << 22, + ZIO_FLAG_RAW_COMPRESS = 1U << 23, + ZIO_FLAG_RAW_ENCRYPT = 1U << 24, + ZIO_FLAG_GANG_CHILD = 1U << 25, + ZIO_FLAG_DDT_CHILD = 1U << 26, + ZIO_FLAG_GODFATHER = 1U << 27, + ZIO_FLAG_NOPWRITE = 1U << 28, + ZIO_FLAG_REEXECUTED = 1U << 29, + ZIO_FLAG_DELEGATED = 1U << 30, + ZIO_FLAG_FASTWRITE = 1U << 31, }; #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) -#define ZIO_CHILD_BIT(x) (1 << (x)) -#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) +#define ZIO_CHILD_BIT(x) (1U << (x)) +#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x))) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; #define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) #define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; typedef void zio_done_func_t(zio_t *zio); extern int zio_exclude_metadata; extern int zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t). */ struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; }; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const abd_t *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, struct abd *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must * be able to propagate them to the parent. The normal io_flags are local * to the zio, not protected by any lock, and not modifiable by children; * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 /* * The io_trim flags are used to specify the type of TRIM to perform. They * only apply to ZIO_TYPE_TRIM zios are distinct from io_flags. */ enum trim_flag { - ZIO_TRIM_SECURE = 1 << 0, + ZIO_TRIM_SECURE = 1U << 0, }; typedef struct zio_alloc_list { list_t zal_list; uint64_t zal_size; } zio_alloc_list_t; typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* io_lsize != io_orig_size iff this is a raw write */ uint64_t io_lsize; /* Data represented by this I/O */ struct abd *io_abd; struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ enum zio_flag io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_child_count; uint64_t io_phys_children; uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; void *io_bio; kmutex_t io_lock; kcondvar_t io_cv; int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; }; enum blk_verify_flag { BLK_VERIFY_ONLY, BLK_VERIFY_LOG, BLK_VERIFY_HALT }; extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, enum trim_flag trim_flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(void *zio); extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, char *tag); extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. */ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr); extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ diff --git a/sys/contrib/openzfs/lib/Makefile.am b/sys/contrib/openzfs/lib/Makefile.am index db7a3fa31d40..f07975cc03fc 100644 --- a/sys/contrib/openzfs/lib/Makefile.am +++ b/sys/contrib/openzfs/lib/Makefile.am @@ -1,43 +1,82 @@ +# +# Shown below is a simplified dependency graph of the OpenZFS provided +# libraries. Administrative commands (`zfs`, `zpool`, etc) interface with +# the kernel modules using the `libzfs.so` and `libzfs_core.so` libraries. +# These libraries provide a stable ABI across OpenZFS point releases. +# +# The `libzpool.so` library is a user space build of the DMU and SPA layers +# used to implement debugging tools (zdb) and code validation tools (ztest). +# These library interfaces are subject to change at any time. +# +# +# CMDS: zhack/ztest/zdb/ zfs/zpool/zed/ +# raidz_{test,bench} zinject/zstream +# | | +# LIBS: | | libzfsbootenv* +# | | | +# | | | +# libzpool libzfs* ----------------+ +# | | | \ / | | | +# libicp --/ | | \ / | | \------- libshare +# | | \ / | | +# libzstd ---/ | \ / | \--------- libuutil +# | \ / \ | | +# libunicode --/ \ / \ | | +# \ / \ | | +# libzutil libzfs_core* | | +# | | | | \ | | | | +# | | | | | | | | | +# | | | | | | | | | +# libtpool -------------/ | | | \---- libnvpair* | | | +# | | | | | | +# libefi -----------------/ | \------ libavl* --------/ | +# | | | +# \-------- libspl ----+------/ +# +# * - A stable ABI is provided for these libraries +# +# # NB: GNU Automake Manual, Chapter 8.3.5: Libtool Convenience Libraries # These nine libraries are intermediary build components. +# SUBDIRS = libavl libicp libshare libspl libtpool libzstd CPPCHECKDIRS = libavl libicp libnvpair libshare libspl libtpool libunicode CPPCHECKDIRS += libuutil libzfs libzfs_core libzfsbootenv libzpool libzutil if BUILD_LINUX SUBDIRS += libefi CPPCHECKDIRS += libefi endif # libnvpair is installed as part of the final build product # libzutil depends on it, so it must be compiled before libzutil SUBDIRS += libnvpair # libzutil depends on libefi if present SUBDIRS += libzutil libunicode # These five libraries, which are installed as the final build product, # incorporate the eight convenience libraries given above. DISTLIBS = libuutil libzfs_core libzfs libzpool libzfsbootenv SUBDIRS += $(DISTLIBS) DISTLIBS += libnvpair # An ABI is stored for each of these libraries. Note that libzpool.so # is only linked against by ztest and zdb and no stable ABI is provided. ABILIBS = libnvpair libuutil libzfs_core libzfs libzfsbootenv PHONY = checkabi storeabi cppcheck checkabi: $(ABILIBS) set -e ; for dir in $(ABILIBS) ; do \ $(MAKE) -C $$dir checkabi ; \ done storeabi: $(ABILIBS) set -e ; for dir in $(ABILIBS) ; do \ $(MAKE) -C $$dir storeabi ; \ done cppcheck: $(CPPCHECKDIRS) set -e ; for dir in $(CPPCHECKDIRS) ; do \ $(MAKE) -C $$dir cppcheck ; \ done diff --git a/sys/contrib/openzfs/lib/libshare/os/linux/nfs.c b/sys/contrib/openzfs/lib/libshare/os/linux/nfs.c index bd578adeec5d..4f754aabd3a3 100644 --- a/sys/contrib/openzfs/lib/libshare/os/linux/nfs.c +++ b/sys/contrib/openzfs/lib/libshare/os/linux/nfs.c @@ -1,606 +1,643 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner * Copyright (c) 2012 Cyril Plisko. All rights reserved. * Copyright (c) 2019, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libshare_impl.h" #include "nfs.h" #define ZFS_EXPORTS_DIR "/etc/exports.d" #define ZFS_EXPORTS_FILE ZFS_EXPORTS_DIR"/zfs.exports" #define ZFS_EXPORTS_LOCK ZFS_EXPORTS_FILE".lock" static sa_fstype_t *nfs_fstype; typedef int (*nfs_shareopt_callback_t)(const char *opt, const char *value, void *cookie); typedef int (*nfs_host_callback_t)(const char *sharepath, const char *filename, const char *host, const char *security, const char *access, void *cookie); /* * Invokes the specified callback function for each Solaris share option * listed in the specified string. */ static int foreach_nfs_shareopt(const char *shareopts, nfs_shareopt_callback_t callback, void *cookie) { char *shareopts_dup, *opt, *cur, *value; int was_nul, error; if (shareopts == NULL) return (SA_OK); if (strcmp(shareopts, "on") == 0) shareopts = "rw,crossmnt"; shareopts_dup = strdup(shareopts); if (shareopts_dup == NULL) return (SA_NO_MEMORY); opt = shareopts_dup; was_nul = 0; while (1) { cur = opt; while (*cur != ',' && *cur != '\0') cur++; if (*cur == '\0') was_nul = 1; *cur = '\0'; if (cur > opt) { value = strchr(opt, '='); if (value != NULL) { *value = '\0'; value++; } error = callback(opt, value, cookie); if (error != SA_OK) { free(shareopts_dup); return (error); } } opt = cur + 1; if (was_nul) break; } free(shareopts_dup); return (SA_OK); } typedef struct nfs_host_cookie_s { nfs_host_callback_t callback; const char *sharepath; void *cookie; const char *filename; const char *security; } nfs_host_cookie_t; /* * Helper function for foreach_nfs_host. This function checks whether the * current share option is a host specification and invokes a callback * function with information about the host. */ static int foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) { int error; const char *access; - char *host_dup, *host, *next; + char *host_dup, *host, *next, *v6Literal; nfs_host_cookie_t *udata = (nfs_host_cookie_t *)pcookie; + int cidr_len; #ifdef DEBUG fprintf(stderr, "foreach_nfs_host_cb: key=%s, value=%s\n", opt, value); #endif if (strcmp(opt, "sec") == 0) udata->security = value; if (strcmp(opt, "rw") == 0 || strcmp(opt, "ro") == 0) { if (value == NULL) value = "*"; access = opt; host_dup = strdup(value); if (host_dup == NULL) return (SA_NO_MEMORY); host = host_dup; do { - next = strchr(host, ':'); - if (next != NULL) { - *next = '\0'; - next++; + if (*host == '[') { + host++; + v6Literal = strchr(host, ']'); + if (v6Literal == NULL) { + free(host_dup); + return (SA_SYNTAX_ERR); + } + if (v6Literal[1] == '\0') { + *v6Literal = '\0'; + next = NULL; + } else if (v6Literal[1] == '/') { + next = strchr(v6Literal + 2, ':'); + if (next == NULL) { + cidr_len = + strlen(v6Literal + 1); + memmove(v6Literal, + v6Literal + 1, + cidr_len); + v6Literal[cidr_len] = '\0'; + } else { + cidr_len = next - v6Literal - 1; + memmove(v6Literal, + v6Literal + 1, + cidr_len); + v6Literal[cidr_len] = '\0'; + next++; + } + } else if (v6Literal[1] == ':') { + *v6Literal = '\0'; + next = v6Literal + 2; + } else { + free(host_dup); + return (SA_SYNTAX_ERR); + } + } else { + next = strchr(host, ':'); + if (next != NULL) { + *next = '\0'; + next++; + } } error = udata->callback(udata->filename, udata->sharepath, host, udata->security, access, udata->cookie); if (error != SA_OK) { free(host_dup); return (error); } host = next; } while (host != NULL); free(host_dup); } return (SA_OK); } /* * Invokes a callback function for all NFS hosts that are set for a share. */ static int foreach_nfs_host(sa_share_impl_t impl_share, char *filename, nfs_host_callback_t callback, void *cookie) { nfs_host_cookie_t udata; char *shareopts; udata.callback = callback; udata.sharepath = impl_share->sa_mountpoint; udata.cookie = cookie; udata.filename = filename; udata.security = "sys"; shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; return (foreach_nfs_shareopt(shareopts, foreach_nfs_host_cb, &udata)); } /* * Converts a Solaris NFS host specification to its Linux equivalent. */ static int get_linux_hostspec(const char *solaris_hostspec, char **plinux_hostspec) { /* * For now we just support CIDR masks (e.g. @192.168.0.0/16) and host * wildcards (e.g. *.example.org). */ if (solaris_hostspec[0] == '@') { /* * Solaris host specifier, e.g. @192.168.0.0/16; we just need * to skip the @ in this case */ *plinux_hostspec = strdup(solaris_hostspec + 1); } else { *plinux_hostspec = strdup(solaris_hostspec); } if (*plinux_hostspec == NULL) { return (SA_NO_MEMORY); } return (SA_OK); } /* * Adds a Linux share option to an array of NFS options. */ static int add_linux_shareopt(char **plinux_opts, const char *key, const char *value) { size_t len = 0; char *new_linux_opts; if (*plinux_opts != NULL) len = strlen(*plinux_opts); new_linux_opts = realloc(*plinux_opts, len + 1 + strlen(key) + (value ? 1 + strlen(value) : 0) + 1); if (new_linux_opts == NULL) return (SA_NO_MEMORY); new_linux_opts[len] = '\0'; if (len > 0) strcat(new_linux_opts, ","); strcat(new_linux_opts, key); if (value != NULL) { strcat(new_linux_opts, "="); strcat(new_linux_opts, value); } *plinux_opts = new_linux_opts; return (SA_OK); } /* * Validates and converts a single Solaris share option to its Linux * equivalent. */ static int get_linux_shareopts_cb(const char *key, const char *value, void *cookie) { char **plinux_opts = (char **)cookie; /* host-specific options, these are taken care of elsewhere */ if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0 || strcmp(key, "sec") == 0) return (SA_OK); if (strcmp(key, "anon") == 0) key = "anonuid"; if (strcmp(key, "root_mapping") == 0) { (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); key = "anonuid"; } if (strcmp(key, "nosub") == 0) key = "subtree_check"; if (strcmp(key, "insecure") != 0 && strcmp(key, "secure") != 0 && strcmp(key, "async") != 0 && strcmp(key, "sync") != 0 && strcmp(key, "no_wdelay") != 0 && strcmp(key, "wdelay") != 0 && strcmp(key, "nohide") != 0 && strcmp(key, "hide") != 0 && strcmp(key, "crossmnt") != 0 && strcmp(key, "no_subtree_check") != 0 && strcmp(key, "subtree_check") != 0 && strcmp(key, "insecure_locks") != 0 && strcmp(key, "secure_locks") != 0 && strcmp(key, "no_auth_nlm") != 0 && strcmp(key, "auth_nlm") != 0 && strcmp(key, "no_acl") != 0 && strcmp(key, "mountpoint") != 0 && strcmp(key, "mp") != 0 && strcmp(key, "fsuid") != 0 && strcmp(key, "refer") != 0 && strcmp(key, "replicas") != 0 && strcmp(key, "root_squash") != 0 && strcmp(key, "no_root_squash") != 0 && strcmp(key, "all_squash") != 0 && strcmp(key, "no_all_squash") != 0 && strcmp(key, "fsid") != 0 && strcmp(key, "anonuid") != 0 && strcmp(key, "anongid") != 0) { return (SA_SYNTAX_ERR); } (void) add_linux_shareopt(plinux_opts, key, value); return (SA_OK); } /* * Takes a string containing Solaris share options (e.g. "sync,no_acl") and * converts them to a NULL-terminated array of Linux NFS options. */ static int get_linux_shareopts(const char *shareopts, char **plinux_opts) { int error; assert(plinux_opts != NULL); *plinux_opts = NULL; /* no_subtree_check - Default as of nfs-utils v1.1.0 */ (void) add_linux_shareopt(plinux_opts, "no_subtree_check", NULL); /* mountpoint - Restrict exports to ZFS mountpoints */ (void) add_linux_shareopt(plinux_opts, "mountpoint", NULL); error = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, plinux_opts); if (error != SA_OK) { free(*plinux_opts); *plinux_opts = NULL; } return (error); } /* * This function populates an entry into /etc/exports.d/zfs.exports. * This file is consumed by the linux nfs server so that zfs shares are * automatically exported upon boot or whenever the nfs server restarts. */ static int nfs_add_entry(const char *filename, const char *sharepath, const char *host, const char *security, const char *access_opts, void *pcookie) { int error; char *linuxhost; const char *linux_opts = (const char *)pcookie; error = get_linux_hostspec(host, &linuxhost); if (error != SA_OK) return (error); if (linux_opts == NULL) linux_opts = ""; FILE *fp = fopen(filename, "a+e"); if (fp == NULL) { fprintf(stderr, "failed to open %s file: %s", filename, strerror(errno)); free(linuxhost); return (SA_SYSTEM_ERR); } if (fprintf(fp, "%s %s(sec=%s,%s,%s)\n", sharepath, linuxhost, security, access_opts, linux_opts) < 0) { fprintf(stderr, "failed to write to %s\n", filename); free(linuxhost); fclose(fp); return (SA_SYSTEM_ERR); } free(linuxhost); if (fclose(fp) != 0) { fprintf(stderr, "Unable to close file %s: %s\n", filename, strerror(errno)); return (SA_SYSTEM_ERR); } return (SA_OK); } /* * This function copies all entries from the exports file to "filename", * omitting any entries for the specified mountpoint. */ int nfs_copy_entries(char *filename, const char *mountpoint) { char *buf = NULL; size_t buflen = 0; int error = SA_OK; FILE *oldfp = fopen(ZFS_EXPORTS_FILE, "re"); FILE *newfp = fopen(filename, "w+e"); if (newfp == NULL) { fprintf(stderr, "failed to open %s file: %s", filename, strerror(errno)); fclose(oldfp); return (SA_SYSTEM_ERR); } fputs(FILE_HEADER, newfp); /* * The ZFS_EXPORTS_FILE may not exist yet. If that's the * case then just write out the new file. */ if (oldfp != NULL) { while (getline(&buf, &buflen, oldfp) != -1) { char *space = NULL; if (buf[0] == '\n' || buf[0] == '#') continue; if ((space = strchr(buf, ' ')) != NULL) { int mountpoint_len = strlen(mountpoint); if (space - buf == mountpoint_len && strncmp(mountpoint, buf, mountpoint_len) == 0) { continue; } } fputs(buf, newfp); } if (ferror(oldfp) != 0) { error = ferror(oldfp); } if (fclose(oldfp) != 0) { fprintf(stderr, "Unable to close file %s: %s\n", filename, strerror(errno)); error = error != 0 ? error : SA_SYSTEM_ERR; } } if (error == 0 && ferror(newfp) != 0) { error = ferror(newfp); } free(buf); if (fclose(newfp) != 0) { fprintf(stderr, "Unable to close file %s: %s\n", filename, strerror(errno)); error = error != 0 ? error : SA_SYSTEM_ERR; } return (error); } /* * Enables NFS sharing for the specified share. */ static int nfs_enable_share_impl(sa_share_impl_t impl_share, char *filename) { char *shareopts, *linux_opts; int error; shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; error = get_linux_shareopts(shareopts, &linux_opts); if (error != SA_OK) return (error); error = foreach_nfs_host(impl_share, filename, nfs_add_entry, linux_opts); free(linux_opts); return (error); } static int nfs_enable_share(sa_share_impl_t impl_share) { return (nfs_toggle_share( ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, ZFS_EXPORTS_DIR, impl_share, nfs_enable_share_impl)); } /* * Disables NFS sharing for the specified share. */ static int nfs_disable_share_impl(sa_share_impl_t impl_share, char *filename) { return (SA_OK); } static int nfs_disable_share(sa_share_impl_t impl_share) { return (nfs_toggle_share( ZFS_EXPORTS_LOCK, ZFS_EXPORTS_FILE, ZFS_EXPORTS_DIR, impl_share, nfs_disable_share_impl)); } static boolean_t nfs_is_shared(sa_share_impl_t impl_share) { size_t buflen = 0; char *buf = NULL; FILE *fp = fopen(ZFS_EXPORTS_FILE, "re"); if (fp == NULL) { return (B_FALSE); } while ((getline(&buf, &buflen, fp)) != -1) { char *space = NULL; if ((space = strchr(buf, ' ')) != NULL) { int mountpoint_len = strlen(impl_share->sa_mountpoint); if (space - buf == mountpoint_len && strncmp(impl_share->sa_mountpoint, buf, mountpoint_len) == 0) { fclose(fp); free(buf); return (B_TRUE); } } } free(buf); fclose(fp); return (B_FALSE); } /* * Checks whether the specified NFS share options are syntactically correct. */ static int nfs_validate_shareopts(const char *shareopts) { char *linux_opts; int error; error = get_linux_shareopts(shareopts, &linux_opts); if (error != SA_OK) return (error); free(linux_opts); return (SA_OK); } static int nfs_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) { FSINFO(impl_share, nfs_fstype)->shareopts = (char *)shareopts; return (SA_OK); } /* * Clears a share's NFS options. Used by libshare to * clean up shares that are about to be free()'d. */ static void nfs_clear_shareopts(sa_share_impl_t impl_share) { FSINFO(impl_share, nfs_fstype)->shareopts = NULL; } static int nfs_commit_shares(void) { char *argv[] = { "/usr/sbin/exportfs", "-ra", NULL }; return (libzfs_run_process(argv[0], argv, 0)); } static const sa_share_ops_t nfs_shareops = { .enable_share = nfs_enable_share, .disable_share = nfs_disable_share, .is_shared = nfs_is_shared, .validate_shareopts = nfs_validate_shareopts, .update_shareopts = nfs_update_shareopts, .clear_shareopts = nfs_clear_shareopts, .commit_shares = nfs_commit_shares, }; /* * Initializes the NFS functionality of libshare. */ void libshare_nfs_init(void) { nfs_fstype = register_fstype("nfs", &nfs_shareops); } diff --git a/sys/contrib/openzfs/lib/libspl/page.c b/sys/contrib/openzfs/lib/libspl/page.c index ad9aad998231..5b0d3f2e5786 100644 --- a/sys/contrib/openzfs/lib/libspl/page.c +++ b/sys/contrib/openzfs/lib/libspl/page.c @@ -1,34 +1,35 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include +#include static size_t pagesize = 0; size_t spl_pagesize(void) { if (pagesize == 0) pagesize = sysconf(_SC_PAGESIZE); return (pagesize); } diff --git a/sys/contrib/openzfs/lib/libzfs/Makefile.am b/sys/contrib/openzfs/lib/libzfs/Makefile.am index e3527ffe7058..e23f7c162afb 100644 --- a/sys/contrib/openzfs/lib/libzfs/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs/Makefile.am @@ -1,100 +1,100 @@ include $(top_srcdir)/config/Rules.am VPATH = \ $(top_srcdir)/module/icp \ $(top_srcdir)/module/zcommon \ $(top_srcdir)/lib/libzfs # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) AM_CFLAGS += $(LIBCRYPTO_CFLAGS) $(ZLIB_CFLAGS) AM_CFLAGS += -fvisibility=hidden pkgconfig_DATA = libzfs.pc lib_LTLIBRARIES = libzfs.la include $(top_srcdir)/config/Abigail.am USER_C = \ libzfs_impl.h \ libzfs_changelist.c \ libzfs_config.c \ libzfs_crypto.c \ libzfs_dataset.c \ libzfs_diff.c \ libzfs_import.c \ libzfs_iter.c \ libzfs_mount.c \ libzfs_pool.c \ libzfs_sendrecv.c \ libzfs_status.c \ libzfs_util.c if BUILD_FREEBSD USER_C += \ os/freebsd/libzfs_compat.c \ - os/freebsd/libzfs_ioctl_compat.c \ os/freebsd/libzfs_zmount.c endif if BUILD_LINUX USER_C += \ os/linux/libzfs_mount_os.c \ os/linux/libzfs_pool_os.c \ os/linux/libzfs_sendrecv_os.c \ os/linux/libzfs_util_os.c endif KERNEL_C = \ algs/sha2/sha2.c \ cityhash.c \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_aarch64_neon.c \ zfs_fletcher_avx512.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ zprop_common.c dist_libzfs_la_SOURCES = \ $(USER_C) nodist_libzfs_la_SOURCES = \ $(KERNEL_C) libzfs_la_LIBADD = \ $(abs_top_builddir)/lib/libshare/libshare.la \ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libzutil/libzutil.la \ $(abs_top_builddir)/lib/libuutil/libuutil.la libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL) libzfs_la_LDFLAGS = -pthread if !ASAN_ENABLED libzfs_la_LDFLAGS += -Wl,-z,defs endif if BUILD_FREEBSD libzfs_la_LIBADD += -lutil -lgeom endif libzfs_la_LDFLAGS += -version-info 5:0:1 include $(top_srcdir)/config/CppCheck.am # Library ABI EXTRA_DIST = libzfs.abi libzfs.suppr # Licensing data EXTRA_DIST += THIRDPARTYLICENSE.openssl THIRDPARTYLICENSE.openssl.descrip diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index a2c79c8568ca..86d612f5e326 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -1,4175 +1,5950 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + - - - - + + + - - - - + + + - - - - - + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + - + - - + + - - + + - - + + - - + + + + + + + + + + + - - - - - - - + + + + + + + + + + - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c index 20251e9e7a5a..fb337ca3f0e6 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c @@ -1,5567 +1,5570 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright 2017-2018 RackTop Systems. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright (c) 2021 Matt Fiddaman */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #include #endif /* HAVE_IDMAP */ #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_deleg.h" static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human * readable form. */ const char * zfs_type_to_name(zfs_type_t type) { switch (type) { case ZFS_TYPE_FILESYSTEM: return (dgettext(TEXT_DOMAIN, "filesystem")); case ZFS_TYPE_SNAPSHOT: return (dgettext(TEXT_DOMAIN, "snapshot")); case ZFS_TYPE_VOLUME: return (dgettext(TEXT_DOMAIN, "volume")); case ZFS_TYPE_POOL: return (dgettext(TEXT_DOMAIN, "pool")); case ZFS_TYPE_BOOKMARK: return (dgettext(TEXT_DOMAIN, "bookmark")); default: assert(!"unhandled zfs_type_t"); } return (NULL); } /* * Validate a ZFS path. This is used even before trying to open the dataset, to * provide a more meaningful error message. We call zfs_error_aux() to * explain exactly why the name was not valid. */ int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying) { namecheck_err_t why; char what; if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot delimiter '@' is not expected here")); return (0); } if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '@' delimiter in snapshot name")); return (0); } if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bookmark delimiter '#' is not expected here")); return (0); } if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '#' delimiter in bookmark name")); return (0); } if (modifying && strchr(path, '%') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character %c in name"), '%'); return (0); } if (entity_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component or misplaced '@'" " or '#' delimiter in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in name"), what); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool doesn't begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "reserved disk name")); break; case NAME_ERR_SELF_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "self reference, '.' is found in name")); break; case NAME_ERR_PARENT_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent reference, '..' is found in name")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (0); } return (-1); } int zfs_name_valid(const char *name, zfs_type_t type) { if (type == ZFS_TYPE_POOL) return (zpool_name_valid(NULL, B_FALSE, name)); return (zfs_validate_name(NULL, name, type, B_FALSE)); } /* * This function takes the raw DSL properties, and filters out the user-defined * properties into a separate nvlist. */ static nvlist_t * process_user_props(zfs_handle_t *zhp, nvlist_t *props) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvpair_t *elem; nvlist_t *propval; nvlist_t *nvl; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { if (!zfs_prop_user(nvpair_name(elem))) continue; verify(nvpair_value_nvlist(elem, &propval) == 0); if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { nvlist_free(nvl); (void) no_memory(hdl); return (NULL); } } return (nvl); } static zpool_handle_t * zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph; if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { if (hdl->libzfs_pool_handles != NULL) zph->zpool_next = hdl->libzfs_pool_handles; hdl->libzfs_pool_handles = zph; } return (zph); } static zpool_handle_t * zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph = hdl->libzfs_pool_handles; while ((zph != NULL) && (strncmp(pool_name, zpool_get_name(zph), len) != 0)) zph = zph->zpool_next; return (zph); } /* * Returns a handle to the pool that contains the provided dataset. * If a handle to that pool already exists then that handle is returned. * Otherwise, a new handle is created and added to the list of handles. */ static zpool_handle_t * zpool_handle(zfs_handle_t *zhp) { char *pool_name; int len; zpool_handle_t *zph; len = strcspn(zhp->zfs_name, "/@#") + 1; pool_name = zfs_alloc(zhp->zfs_hdl, len); (void) strlcpy(pool_name, zhp->zfs_name, len); zph = zpool_find_handle(zhp, pool_name, len); if (zph == NULL) zph = zpool_add_handle(zhp, pool_name); free(pool_name); return (zph); } void zpool_free_handles(libzfs_handle_t *hdl) { zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; while (zph != NULL) { next = zph->zpool_next; zpool_close(zph); zph = next; } hdl->libzfs_pool_handles = NULL; } /* * Utility function to gather stats (objset and zpl) for the given object. */ static int get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { return (-1); } } else { return (-1); } } return (0); } /* * Utility function to get the received properties of the given object. */ static int get_recvd_props_ioctl(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *recvdprops; zfs_cmd_t zc = {"\0"}; int err; if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { return (-1); } } else { zcmd_free_nvlists(&zc); return (-1); } } err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); zcmd_free_nvlists(&zc); if (err != 0) return (-1); nvlist_free(zhp->zfs_recvd_props); zhp->zfs_recvd_props = recvdprops; return (0); } static int put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) { nvlist_t *allprops, *userprops; zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } /* * XXX Why do we store the user props separately, in addition to * storing them in zfs_props? */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); } nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); zhp->zfs_props = allprops; zhp->zfs_user_props = userprops; return (0); } static int get_stats(zfs_handle_t *zhp) { int rc = 0; zfs_cmd_t zc = {"\0"}; if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) return (-1); if (get_stats_ioctl(zhp, &zc) != 0) rc = -1; else if (put_stats_zhdl(zhp, &zc) != 0) rc = -1; zcmd_free_nvlists(&zc); return (rc); } /* * Refresh the properties currently stored in the handle. */ void zfs_refresh_properties(zfs_handle_t *zhp) { (void) get_stats(zhp); } /* * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ static int make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { if (put_stats_zhdl(zhp, zc) != 0) return (-1); /* * We've managed to open the dataset and gather statistics. Determine * the high-level type. */ if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_head_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) return (-1); else abort(); if (zhp->zfs_dmustats.dds_is_snapshot) zhp->zfs_type = ZFS_TYPE_SNAPSHOT; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_type = ZFS_TYPE_FILESYSTEM; else abort(); /* we should never see any other types */ if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) return (-1); return (0); } zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { free(zhp); return (NULL); } if (get_stats_ioctl(zhp, &zc) == -1) { zcmd_free_nvlists(&zc); free(zhp); return (NULL); } if (make_dataset_handle_common(zhp, &zc) == -1) { free(zhp); zhp = NULL; } zcmd_free_nvlists(&zc); return (zhp); } zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); if (make_dataset_handle_common(zhp, zc) == -1) { free(zhp); return (NULL); } return (zhp); } zfs_handle_t * make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = pzhp->zfs_hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); zhp->zfs_head_type = pzhp->zfs_type; zhp->zfs_type = ZFS_TYPE_SNAPSHOT; zhp->zpool_hdl = zpool_handle(zhp); return (zhp); } zfs_handle_t * zfs_handle_dup(zfs_handle_t *zhp_orig) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = zhp_orig->zfs_hdl; zhp->zpool_hdl = zhp_orig->zpool_hdl; (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, sizeof (zhp->zfs_name)); zhp->zfs_type = zhp_orig->zfs_type; zhp->zfs_head_type = zhp_orig->zfs_head_type; zhp->zfs_dmustats = zhp_orig->zfs_dmustats; if (zhp_orig->zfs_props != NULL) { if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_user_props != NULL) { if (nvlist_dup(zhp_orig->zfs_user_props, &zhp->zfs_user_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_recvd_props != NULL) { if (nvlist_dup(zhp_orig->zfs_recvd_props, &zhp->zfs_recvd_props, 0)) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; if (zhp_orig->zfs_mntopts != NULL) { zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, zhp_orig->zfs_mntopts); } zhp->zfs_props_table = zhp_orig->zfs_props_table; return (zhp); } boolean_t zfs_bookmark_exists(const char *path) { nvlist_t *bmarks; nvlist_t *props; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmark_name; char *pound; int err; boolean_t rv; (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) return (B_FALSE); *pound = '\0'; bmark_name = pound + 1; props = fnvlist_alloc(); err = lzc_get_bookmarks(fsname, props, &bmarks); nvlist_free(props); if (err != 0) { nvlist_free(bmarks); return (B_FALSE); } rv = nvlist_exists(bmarks, bmark_name); nvlist_free(bmarks); return (rv); } zfs_handle_t * make_bookmark_handle(zfs_handle_t *parent, const char *path, nvlist_t *bmark_props) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); /* Fill in the name. */ zhp->zfs_hdl = parent->zfs_hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); /* Set the property lists. */ if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { free(zhp); return (NULL); } /* Set the types. */ zhp->zfs_head_type = parent->zfs_head_type; zhp->zfs_type = ZFS_TYPE_BOOKMARK; if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { nvlist_free(zhp->zfs_props); free(zhp); return (NULL); } return (zhp); } struct zfs_open_bookmarks_cb_data { const char *path; zfs_handle_t *zhp; }; static int zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) { struct zfs_open_bookmarks_cb_data *dp = data; /* * Is it the one we are looking for? */ if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { /* * We found it. Save it and let the caller know we are done. */ dp->zhp = zhp; return (EEXIST); } /* * Not found. Close the handle and ask for another one. */ zfs_close(zhp); return (0); } /* * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' * argument is a mask of acceptable types. The function will print an * appropriate error message and return NULL if it can't be opened. */ zfs_handle_t * zfs_open(libzfs_handle_t *hdl, const char *path, int types) { zfs_handle_t *zhp; char errbuf[1024]; char *bookp; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); /* * Validate the name before we even try to open it. */ if (!zfs_validate_name(hdl, path, types, B_FALSE)) { (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (NULL); } /* * Bookmarks needs to be handled separately. */ bookp = strchr(path, '#'); if (bookp == NULL) { /* * Try to get stats for the dataset, which will tell us if it * exists. */ errno = 0; if ((zhp = make_dataset_handle(hdl, path)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } } else { char dsname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *pzhp; struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; /* * We need to cut out '#' and everything after '#' * to get the parent dataset name only. */ assert(bookp - path < sizeof (dsname)); (void) strncpy(dsname, path, bookp - path); dsname[bookp - path] = '\0'; /* * Create handle for the parent dataset. */ errno = 0; if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } /* * Iterate bookmarks to find the right one. */ errno = 0; if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb, &cb_data) == 0) && (cb_data.zhp == NULL)) { (void) zfs_error(hdl, EZFS_NOENT, errbuf); zfs_close(pzhp); return (NULL); } if (cb_data.zhp == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(pzhp); return (NULL); } zhp = cb_data.zhp; /* * Cleanup. */ zfs_close(pzhp); } if (!(types & zhp->zfs_type)) { (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (NULL); } return (zhp); } /* * Release a ZFS handle. Nothing to do but free the associated memory. */ void zfs_close(zfs_handle_t *zhp) { if (zhp->zfs_mntopts) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); nvlist_free(zhp->zfs_recvd_props); free(zhp); } typedef struct mnttab_node { struct mnttab mtn_mt; avl_node_t mtn_node; } mnttab_node_t; static int libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) { const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; int rv; rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); return (TREE_ISIGN(rv)); } void libzfs_mnttab_init(libzfs_handle_t *hdl) { pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } static int libzfs_mnttab_update(libzfs_handle_t *hdl) { FILE *mnttab; struct mnttab entry; if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); while (getmntent(mnttab, &entry) == 0) { mnttab_node_t *mtn; avl_index_t where; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); /* Exclude duplicate mounts */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); continue; } avl_add(&hdl->libzfs_mnttab_cache, mtn); } (void) fclose(mnttab); return (0); } void libzfs_mnttab_fini(libzfs_handle_t *hdl) { void *cookie = NULL; mnttab_node_t *mtn; while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } avl_destroy(&hdl->libzfs_mnttab_cache); (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) { hdl->libzfs_mnttab_enable = enable; } int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) { FILE *mnttab; mnttab_node_t find; mnttab_node_t *mtn; int ret = ENOENT; if (!hdl->libzfs_mnttab_enable) { struct mnttab srch = { 0 }; if (avl_numnodes(&hdl->libzfs_mnttab_cache)) libzfs_mnttab_fini(hdl); if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); srch.mnt_special = (char *)fsname; srch.mnt_fstype = MNTTYPE_ZFS; ret = getmntany(mnttab, entry, &srch) ? ENOENT : 0; (void) fclose(mnttab); return (ret); } pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { int error; if ((error = libzfs_mnttab_update(hdl)) != 0) { pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (error); } } find.mtn_mt.mnt_special = (char *)fsname; mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); if (mtn) { *entry = mtn->mtn_mt; ret = 0; } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (ret); } void libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, const char *mountp, const char *mntopts) { mnttab_node_t *mtn; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) { mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); /* * Another thread may have already added this entry * via libzfs_mnttab_update. If so we should skip it. */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } else { avl_add(&hdl->libzfs_mnttab_cache, mtn); } } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) { mnttab_node_t find; mnttab_node_t *ret; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); find.mtn_mt.mnt_special = (char *)fsname; if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) != NULL) { avl_remove(&hdl->libzfs_mnttab_cache, ret); free(ret->mtn_mt.mnt_special); free(ret->mtn_mt.mnt_mountp); free(ret->mtn_mt.mnt_fstype); free(ret->mtn_mt.mnt_mntopts); free(ret); } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { zpool_handle_t *zpool_handle = zhp->zpool_hdl; if (zpool_handle == NULL) return (-1); *spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); return (0); } /* * The choice of reservation property depends on the SPA version. */ static int zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) { int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); if (spa_version >= SPA_VERSION_REFRESERVATION) *resv_prop = ZFS_PROP_REFRESERVATION; else *resv_prop = ZFS_PROP_RESERVATION; return (0); } /* * Given an nvlist of properties to set, validates that they are correct, and * parses any numeric properties (index, boolean, etc) if they are specified as * strings. */ nvlist_t * zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, boolean_t key_params_ok, const char *errbuf) { nvpair_t *elem; uint64_t intval; char *strval; zfs_prop_t prop; nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } /* * Make sure this property is valid and applies to this type. */ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zfs_name_to_prop(propname); if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (nvlist_add_string(ret, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Currently, only user properties can be modified on * snapshots. */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { zfs_userquota_prop_t uqtype; char *newpropname = NULL; char domain[128]; uint64_t rid; uint64_t valary[3]; int rc; if (userquota_propname_decode(propname, zoned, &uqtype, domain, sizeof (domain), &rid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' has an invalid user/group name"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (uqtype != ZFS_PROP_USERQUOTA && uqtype != ZFS_PROP_GROUPQUOTA && uqtype != ZFS_PROP_USEROBJQUOTA && uqtype != ZFS_PROP_GROUPOBJQUOTA && uqtype != ZFS_PROP_PROJECTQUOTA && uqtype != ZFS_PROP_PROJECTOBJQUOTA) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (nvpair_type(elem) == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &strval); if (strcmp(strval, "none") == 0) { intval = 0; } else if (zfs_nicestrtonum(hdl, strval, &intval) != 0) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, &intval); if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable " "{user|group|project}quota")); goto error; } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * Encode the prop name as * userquota@-domain, to make it easy * for the kernel to decode. */ rc = asprintf(&newpropname, "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], (longlong_t)rid, domain); if (rc == -1 || newpropname == NULL) { (void) no_memory(hdl); goto error; } valary[0] = uqtype; valary[1] = rid; valary[2] = intval; if (nvlist_add_uint64_array(ret, newpropname, valary, 3) != 0) { free(newpropname); (void) no_memory(hdl); goto error; } free(newpropname); continue; } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " "apply to datasets of this type"), propname); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (zfs_prop_readonly(prop) && !(zfs_prop_setonce(prop) && zhp == NULL) && !(zfs_prop_encryption_key_param(prop) && key_params_ok)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, type, ret, &strval, &intval, errbuf) != 0) goto error; /* * Perform some additional checks for specific properties. */ switch (prop) { case ZFS_PROP_VERSION: { int version; if (zhp == NULL) break; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (intval < version) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Can not downgrade; already at version %u"), version); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: { int maxbs = SPA_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); } /* * The value must be a power of two between * SPA_MINBLOCKSIZE and maxbs. */ if (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval)) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be power of 2 from 512B " "to %s"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { int maxbs = SPA_OLD_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { char state[64] = ""; maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); /* * Issue a warning but do not fail so that * tests for settable properties succeed. */ if (zpool_prop_get_feature(zpool_hdl, "feature@allocation_classes", state, sizeof (state)) != 0 || strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { (void) fprintf(stderr, gettext( "%s: property requires a special " "device in the pool\n"), propname); } } if (intval != 0 && (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval))) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid '%s=%llu' property: must be zero " "or a power of 2 from 512B to %s"), propname, (unsigned long long)intval, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL /* * Verify the mlslabel string and convert to * internal hex label string. */ m_label_t *new_sl; char *hex = NULL; /* internal label string */ /* Default value is already OK. */ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) break; /* Verify the label can be converted to binary form */ if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || (str_to_label(strval, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1)) { goto badlabel; } /* Now translate to hex internal label string */ if (label_to_str(new_sl, &hex, M_INTERNAL, DEF_NAMES) != 0) { if (hex) free(hex); goto badlabel; } m_label_free(new_sl); /* If string is already in internal form, we're done. */ if (strcmp(strval, hex) == 0) { free(hex); break; } /* Replace the label string with the internal form. */ (void) nvlist_remove(ret, zfs_prop_to_name(prop), DATA_TYPE_STRING); verify(nvlist_add_string(ret, zfs_prop_to_name(prop), hex) == 0); free(hex); break; badlabel: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid mlslabel '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); m_label_free(new_sl); /* OK if null */ goto error; #else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "mlslabels are unsupported")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; #endif /* HAVE_MLSLABEL */ } case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) break; if (mountpoint_namecheck(strval, &why)) { switch (why) { case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be an absolute path, " "'none', or 'legacy'"), propname); break; case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "component of '%s' is too long"), propname); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } fallthrough; } case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* * For the mountpoint and sharenfs or sharesmb * properties, check if it can be set in a * global/non-global zone based on * the zoned property value: * * global zone non-global zone * -------------------------------------------------- * zoned=on mountpoint (no) mountpoint (yes) * sharenfs (no) sharenfs (no) * sharesmb (no) sharesmb (no) * * zoned=off mountpoint (yes) N/A * sharenfs (yes) * sharesmb (yes) */ if (zoned) { if (getzoneid() == GLOBAL_ZONEID) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set on " "dataset in a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } else if (prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set in " "a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " "'zoned' property is set"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } /* * At this point, it is legitimate to set the * property. Now we want to make sure that the * property value is valid if it is sharenfs. */ if ((prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) && strcmp(strval, "on") != 0 && strcmp(strval, "off") != 0) { zfs_share_proto_t proto; if (prop == ZFS_PROP_SHARESMB) proto = PROTO_SMB; else proto = PROTO_NFS; if (zfs_parse_options(strval, proto) != SA_OK) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set to invalid " "options"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_KEYLOCATION: if (!zfs_prop_valid_keylocation(strval, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid keylocation")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zhp != NULL) { uint64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF && strcmp(strval, "none") != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must be 'none' " "for unencrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (crypt != ZIO_CRYPT_OFF && strcmp(strval, "none") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must not be 'none' " "for encrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_PBKDF2_ITERS: if (intval < MIN_PBKDF2_ITERATIONS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "minimum pbkdf2 iterations is %u"), MIN_PBKDF2_ITERATIONS); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; case ZFS_PROP_NORMALIZE: chosen_normal = (int)intval; break; default: break; } /* * For changes to existing volumes, we have some additional * checks to enforce. */ if (type == ZFS_TYPE_VOLUME && zhp != NULL) { uint64_t blocksize = zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE); char buf[64]; switch (prop) { case ZFS_PROP_VOLSIZE: if (intval % blocksize != 0) { zfs_nicebytes(blocksize, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a multiple of " "volume block size (%s)"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be zero"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } /* check encryption properties */ if (zhp != NULL) { int64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); switch (prop) { case ZFS_PROP_COPIES: if (crypt != ZIO_CRYPT_OFF && intval > 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encrypted datasets cannot have " "3 copies")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } } /* * If normalization was chosen, but no UTF8 choice was made, * enforce rejection of non-UTF8 names. * * If normalization was chosen, but rejecting non-UTF8 names * was explicitly not chosen, it is an error. */ if (chosen_normal > 0 && chosen_utf < 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { (void) no_memory(hdl); goto error; } } else if (chosen_normal > 0 && chosen_utf == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be set 'on' if normalization chosen"), zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } return (ret); error: nvlist_free(ret); return (NULL); } static int zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t old_volsize; uint64_t new_volsize; uint64_t old_reservation; uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, * make sure that it matches the reservation, or add it if necessary. */ old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_reservation = zfs_prop_get_int(zhp, resv_prop); props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &new_volsize) != 0) { fnvlist_free(props); return (-1); } new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), new_reservation) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } /* * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. * Return codes must match zfs_add_synthetic_resv(). */ static int zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t volsize; uint64_t resvsize; zfs_prop_t prop; nvlist_t *props; if (!ZFS_IS_VOLUME(zhp)) { return (0); } if (zfs_which_resv_prop(zhp, &prop) != 0) { return (-1); } if (prop != ZFS_PROP_REFRESERVATION) { return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { /* No value being set, so it can't be "auto" */ return (0); } if (resvsize != UINT64_MAX) { /* Being set to a value other than "auto" */ return (0); } props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * Given a property name and value, set the property for the given dataset. */ int zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) { int ret = -1; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_string(nvl, propname, propval) != 0) { (void) no_memory(hdl); goto error; } ret = zfs_prop_set_list(zhp, nvl); error: nvlist_free(nvl); return (ret); } /* * Given an nvlist of property names and values, set the properties for the * given dataset. */ int zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) { zfs_cmd_t zc = {"\0"}; int ret = -1; prop_changelist_t **cls = NULL; int cl_idx; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; zfs_prop_t prop = 0; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, B_FALSE, errbuf)) == NULL) goto error; /* * We have to check for any extra properties which need to be added * before computing the length of the nvlist. */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { goto error; } } if (added_resv != 1 && (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { goto error; } /* * Check how many properties we're setting and allocate an array to * store changelist pointers for postfix(). */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) nvl_len++; if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) goto error; cl_idx = 0; for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); assert(cl_idx < nvl_len); /* * We don't want to unmount & remount the dataset when changing * its canmount property to 'on' or 'noauto'. We only use * the changelist logic to unmount when setting canmount=off. */ if (prop != ZFS_PROP_CANMOUNT || (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && zfs_is_mounted(zhp, NULL))) { cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); if (cls[cl_idx] == NULL) goto error; } if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cls[cl_idx])) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if (cls[cl_idx] != NULL && (ret = changelist_prefix(cls[cl_idx])) != 0) goto error; cl_idx++; } assert(cl_idx == nvl_len); /* * Execute the corresponding ioctl() to set this list of properties. */ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 || (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0) goto error; ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); if (ret != 0) { if (zc.zc_nvlist_dst_filled == B_FALSE) { (void) zfs_standard_error(hdl, errno, errbuf); goto error; } /* Get the list of unset properties back and report them. */ nvlist_t *errorprops = NULL; if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) goto error; for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); elem != NULL; elem = nvlist_next_nvpair(errorprops, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); zfs_setprop_error(hdl, prop, errno, errbuf); } nvlist_free(errorprops); if (added_resv && errno == ENOSPC) { /* clean up the volsize property we tried to set */ uint64_t old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); nvlist_free(nvl); nvl = NULL; zcmd_free_nvlists(&zc); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) goto error; if (nvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), old_volsize) != 0) goto error; if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) goto error; (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); } } else { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) { int clp_err = changelist_postfix(cls[cl_idx]); if (clp_err != 0) ret = clp_err; } } if (ret == 0) { /* * Refresh the statistics so the new property * value is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } error: nvlist_free(nvl); zcmd_free_nvlists(&zc); if (cls != NULL) { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) changelist_free(cls[cl_idx]); } free(cls); } return (ret); } /* * Given a property, inherit the value from the parent dataset, or if received * is TRUE, revert to the received value, if any. */ int zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = {"\0"}; int ret; prop_changelist_t *cl; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; zfs_prop_t prop; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { /* * For user properties, the amount of work we have to do is very * small, so just do it here. */ if (!zfs_prop_user(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) return (zfs_standard_error(hdl, errno, errbuf)); (void) get_stats(zhp); return (0); } /* * Verify that this property is inheritable. */ if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* * Check to see if the value applies to this type */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); /* * Normalize the name, to get rid of shorthand abbreviations. */ propname = zfs_prop_to_name(prop); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Determine datasets which will be affected by this change, if any. */ if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) return (-1); if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { return (zfs_standard_error(hdl, errno, errbuf)); } else { if ((ret = changelist_postfix(cl)) != 0) goto error; /* * Refresh the statistics so the new property is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } error: changelist_free(cl); return (ret); } /* * True DSL properties are stored in an nvlist. The following two functions * extract them appropriately. */ uint64_t getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; uint64_t value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = ""; } return (value); } static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_string(prop); *source = ""; } return (value); } static boolean_t zfs_is_recvd_props_mode(zfs_handle_t *zhp) { return (zhp->zfs_props == zhp->zfs_recvd_props); } static void zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; zhp->zfs_props = zhp->zfs_recvd_props; } static void zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; *cookie = 0; } /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. * * Certain properties can be overridden using 'mount -o'. In this case, scan * the contents of the /proc/self/mounts entry, searching for the * appropriate options. If they differ from the on-disk values, report the * current values and mark the source "temporary". */ static int get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, char **source, uint64_t *val) { zfs_cmd_t zc = {"\0"}; nvlist_t *zplprops = NULL; struct mnttab mnt; char *mntopt_on = NULL; char *mntopt_off = NULL; boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; /* * If the property is being fetched for a snapshot, check whether * the property is valid for the snapshot's head dataset type. */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { *val = zfs_prop_default_numeric(prop); return (-1); } switch (prop) { case ZFS_PROP_ATIME: mntopt_on = MNTOPT_ATIME; mntopt_off = MNTOPT_NOATIME; break; case ZFS_PROP_RELATIME: mntopt_on = MNTOPT_RELATIME; mntopt_off = MNTOPT_NORELATIME; break; case ZFS_PROP_DEVICES: mntopt_on = MNTOPT_DEVICES; mntopt_off = MNTOPT_NODEVICES; break; case ZFS_PROP_EXEC: mntopt_on = MNTOPT_EXEC; mntopt_off = MNTOPT_NOEXEC; break; case ZFS_PROP_READONLY: mntopt_on = MNTOPT_RO; mntopt_off = MNTOPT_RW; break; case ZFS_PROP_SETUID: mntopt_on = MNTOPT_SETUID; mntopt_off = MNTOPT_NOSETUID; break; case ZFS_PROP_XATTR: mntopt_on = MNTOPT_XATTR; mntopt_off = MNTOPT_NOXATTR; break; case ZFS_PROP_NBMAND: mntopt_on = MNTOPT_NBMAND; mntopt_off = MNTOPT_NONBMAND; break; default: break; } /* * Because looking up the mount options is potentially expensive * (iterating over all of /proc/self/mounts), we defer its * calculation until we're looking up a property which requires * its presence. */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); if (zhp->zfs_mntopts == NULL) return (-1); } zhp->zfs_mntcheck = B_TRUE; } if (zhp->zfs_mntopts == NULL) mnt.mnt_mntopts = ""; else mnt.mnt_mntopts = zhp->zfs_mntopts; switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_READONLY: case ZFS_PROP_SETUID: #ifndef __FreeBSD__ case ZFS_PROP_XATTR: #endif case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); if (received) break; if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) *src = ZPROP_SRC_TEMPORARY; } else if (hasmntopt(&mnt, mntopt_off) && *val) { *val = B_FALSE; if (src) *src = ZPROP_SRC_TEMPORARY; } break; case ZFS_PROP_CANMOUNT: case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { /* not default, must be local */ *source = zhp->zfs_name; } break; case ZFS_PROP_MOUNTED: *val = (zhp->zfs_mntopts != NULL); break; case ZFS_PROP_NUMCLONES: *val = zhp->zfs_dmustats.dds_num_clones; break; case ZFS_PROP_VERSION: case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: case ZFS_PROP_CASE: if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); if (prop == ZFS_PROP_VERSION && zhp->zfs_type == ZFS_TYPE_VOLUME) *val = zfs_prop_default_numeric(prop); return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); return (-1); } nvlist_free(zplprops); zcmd_free_nvlists(&zc); break; case ZFS_PROP_INCONSISTENT: *val = zhp->zfs_dmustats.dds_inconsistent; break; case ZFS_PROP_REDACTED: *val = zhp->zfs_dmustats.dds_redacted; break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* * If we tried to use a default value for a * readonly property, it means that it was not * present. Note this only applies to "truly" * readonly properties, not set-once properties * like volblocksize. */ if (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop) && *source != NULL && (*source)[0] == '\0') { *source = NULL; return (-1); } break; case PROP_TYPE_STRING: default: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "cannot get non-numeric property")); return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "internal error"))); } } return (0); } /* * Calculate the source type, given the raw source string. */ static void get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, char *statbuf, size_t statlen) { if (statbuf == NULL || srctype == NULL || *srctype == ZPROP_SRC_TEMPORARY) { return; } if (source == NULL) { *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; } else { (void) strlcpy(statbuf, source, statlen); *srctype = ZPROP_SRC_INHERITED; } } } int zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, size_t proplen, boolean_t literal) { zfs_prop_t prop; int err = 0; if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (-1); prop = zfs_name_to_prop(propname); if (prop != ZPROP_INVAL) { uint64_t cookie; if (!nvlist_exists(zhp->zfs_recvd_props, propname)) return (-1); zfs_set_recvd_props_mode(zhp, &cookie); err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); } else { nvlist_t *propval; char *recvdval; if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, propname, &propval) != 0) return (-1); verify(nvlist_lookup_string(propval, ZPROP_VALUE, &recvdval) == 0); (void) strlcpy(propbuf, recvdval, proplen); } return (err == 0 ? 0 : -1); } static int get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; nvpair_t *pair; value = zfs_get_clones_nvl(zhp); if (value == NULL || nvlist_empty(value)) return (-1); propbuf[0] = '\0'; for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; pair = nvlist_next_nvpair(value, pair)) { if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) strlcat(propbuf, nvpair_name(pair), proplen); } return (0); } struct get_clones_arg { uint64_t numclones; nvlist_t *value; const char *origin; char buf[ZFS_MAX_DATASET_NAME_LEN]; }; static int get_clones_cb(zfs_handle_t *zhp, void *arg) { struct get_clones_arg *gca = arg; if (gca->numclones == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), NULL, NULL, 0, B_TRUE) != 0) goto out; if (strcmp(gca->buf, gca->origin) == 0) { fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); gca->numclones--; } out: (void) zfs_iter_children(zhp, get_clones_cb, gca); zfs_close(zhp); return (0); } nvlist_t * zfs_get_clones_nvl(zfs_handle_t *zhp) { nvlist_t *nv, *value; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { struct get_clones_arg gca; /* * if this is a snapshot, then the kernel wasn't able * to get the clones. Do it by slowly iterating. */ if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) return (NULL); if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) return (NULL); if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { nvlist_free(nv); return (NULL); } gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); gca.value = value; gca.origin = zhp->zfs_name; if (gca.numclones != 0) { zfs_handle_t *root; char pool[ZFS_MAX_DATASET_NAME_LEN]; char *cp = pool; /* get the pool name */ (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); (void) strsep(&cp, "/@"); root = zfs_open(zhp->zfs_hdl, pool, ZFS_TYPE_FILESYSTEM); if (root == NULL) { nvlist_free(nv); nvlist_free(value); return (NULL); } (void) get_clones_cb(root, &gca); } if (gca.numclones != 0 || nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || nvlist_add_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { nvlist_free(nv); nvlist_free(value); return (NULL); } nvlist_free(nv); nvlist_free(value); verify(0 == nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv)); } verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0); return (value); } static int get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; uint64_t *snaps; uint_t nsnaps; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) return (-1); if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, &nsnaps) != 0) return (-1); if (nsnaps == 0) { /* There's no redaction snapshots; pass a special value back */ (void) snprintf(propbuf, proplen, "none"); return (0); } propbuf[0] = '\0'; for (int i = 0; i < nsnaps; i++) { char buf[128]; if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)snaps[i]); (void) strlcat(propbuf, buf, proplen); } return (0); } /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are * not equal, print both of them. */ static void zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, const char *strval) { if (!zhp->zfs_hdl->libzfs_prop_debug) return; int error; char *poolname = zhp->zpool_hdl->zpool_name; const char *prop_name = zfs_prop_to_name(prop); const char *program = "args = ...\n" "ds = args['dataset']\n" "prop = args['property']\n" "value, setpoint = zfs.get_prop(ds, prop)\n" "return {value=value, setpoint=setpoint}\n"; nvlist_t *outnvl; nvlist_t *retnvl; nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); error = lzc_channel_program_nosync(poolname, program, 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); if (error == 0) { retnvl = fnvlist_lookup_nvlist(outnvl, "return"); if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { int64_t ans; error = nvlist_lookup_int64(retnvl, "value", &ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (ans != intval) { (void) fprintf(stderr, "%s: zfs found %llu, " "but zcp found %llu\n", prop_name, (u_longlong_t)intval, (u_longlong_t)ans); } } else { char *str_ans; error = nvlist_lookup_string(retnvl, "value", &str_ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (strcmp(strval, str_ans) != 0) { (void) fprintf(stderr, "%s: zfs found '%s', but zcp found '%s'\n", prop_name, strval, str_ans); } } } else { (void) fprintf(stderr, "%s: zcp check failed, channel program " "error: %u\n", prop_name, error); } nvlist_free(argnvl); nvlist_free(outnvl); } /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a * human-readable form. * * Returns 0 on success, or -1 on error. */ int zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) { char *source = NULL; uint64_t val; const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (-1); if (received && zfs_prop_readonly(prop)) return (-1); if (src) *src = ZPROP_SRC_NONE; switch (prop) { case ZFS_PROP_CREATION: /* * 'creation' is a time_t stored in the statistics. We convert * this into a string unless 'literal' is specified. */ { val = getprop_uint64(zhp, prop, &source); time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_MOUNTPOINT: /* * Getting the precise mountpoint can be tricky. * * - for 'none' or 'legacy', return those values. * - for inherited mountpoints, we want to take everything * after our ancestor and append it to the inherited value. * * If the pool has an alternate root, we want to prepend that * root to any values we return. */ str = getprop_string(zhp, prop, &source); if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; const char *relpath; /* * If we inherit the mountpoint, even from a dataset * with a received value, the source will be the path of * the dataset we inherit from. If source is * ZPROP_SOURCE_VAL_RECVD, the received value is not * inherited. */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { relpath = ""; } else { relpath = zhp->zfs_name + strlen(source); if (relpath[0] == '/') relpath++; } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, B_FALSE)) || (strcmp(root, "-") == 0)) root[0] = '\0'; /* * Special case an alternate root of '/'. This will * avoid having multiple leading slashes in the * mountpoint path. */ if (strcmp(root, "/") == 0) root++; /* * If the mountpoint is '/' then skip over this * if we are obtaining either an alternate root or * an inherited mountpoint. */ if (str[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0')) str++; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", root, str); else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, relpath[0] == '@' ? "" : "/", relpath); } else { /* 'legacy' or 'none' */ (void) strlcpy(propbuf, str, proplen); } zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_ORIGIN: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case ZFS_PROP_REDACT_SNAPS: if (get_rsnaps_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If quota or reservation is 0, we translate this into 'none' * (unless literal is set), and indicate that it's the default * value. Otherwise, we print the number nicely and indicate * that its set locally. */ if (val == 0) { if (literal) (void) strlcpy(propbuf, "0", proplen); else (void) strlcpy(propbuf, "none", proplen); } else { if (literal) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); else zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If limit is UINT64_MAX, we translate this into 'none' (unless * literal is set), and indicate that it's the default value. * Otherwise, we print the number nicely and indicate that it's * set locally. */ if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else if (val == UINT64_MAX) { (void) strlcpy(propbuf, "none", proplen); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) (void) snprintf(propbuf, proplen, "%llu.%02llu", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); else (void) snprintf(propbuf, proplen, "%llu.%02llux", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_TYPE: switch (zhp->zfs_type) { case ZFS_TYPE_FILESYSTEM: str = "filesystem"; break; case ZFS_TYPE_VOLUME: str = "volume"; break; case ZFS_TYPE_SNAPSHOT: str = "snapshot"; break; case ZFS_TYPE_BOOKMARK: str = "bookmark"; break; default: abort(); } (void) snprintf(propbuf, proplen, "%s", str); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MOUNTED: /* * The 'mounted' property is a pseudo-property that described * whether the filesystem is currently mounted. Even though * it's a boolean value, the typical values of "on" and "off" * don't make sense, so we translate to "yes" and "no". */ if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, src, &source, &val) != 0) return (-1); if (val) (void) strlcpy(propbuf, "yes", proplen); else (void) strlcpy(propbuf, "no", proplen); break; case ZFS_PROP_NAME: /* * The 'name' property is a pseudo-property derived from the * dataset name. It is presented as a real property to simplify * consumers. */ (void) strlcpy(propbuf, zhp->zfs_name, proplen); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL m_label_t *new_sl = NULL; char *ascii = NULL; /* human readable label */ (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); if (literal || (strcasecmp(propbuf, ZFS_MLSLABEL_DEFAULT) == 0)) break; /* * Try to translate the internal hex string to * human-readable output. If there are any * problems just use the hex string. */ if (str_to_label(propbuf, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1) { m_label_free(new_sl); break; } if (label_to_str(new_sl, &ascii, M_LABEL, DEF_NAMES) != 0) { if (ascii) free(ascii); m_label_free(new_sl); break; } m_label_free(new_sl); (void) strlcpy(propbuf, ascii, proplen); free(ascii); #else (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); #endif /* HAVE_MLSLABEL */ } break; case ZFS_PROP_GUID: case ZFS_PROP_CREATETXG: case ZFS_PROP_OBJSETID: case ZFS_PROP_PBKDF2_ITERS: /* * These properties are stored as numbers, but they are * identifiers or counters. * We don't want them to be pretty printed, because pretty * printing truncates their values making them useless. */ if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFERENCED: case ZFS_PROP_AVAILABLE: case ZFS_PROP_USED: case ZFS_PROP_USEDSNAP: case ZFS_PROP_USEDDS: case ZFS_PROP_USEDREFRESERV: case ZFS_PROP_USEDCHILD: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) { return (-1); } if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case PROP_TYPE_STRING: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case PROP_TYPE_INDEX: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (zfs_prop_index_to_string(prop, val, &strval) != 0) return (-1); (void) strlcpy(propbuf, strval, proplen); zcp_check(zhp, prop, 0, strval); break; default: abort(); } } get_source(zhp, src, source, statbuf, statlen); return (0); } /* * Utility function to get the given numeric property. Does no validation that * the given property is the appropriate type; should only be used with * hard-coded property types. */ uint64_t zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) { char *source; uint64_t val = 0; (void) get_numeric_property(zhp, prop, NULL, &source, &val); return (val); } static int zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); } /* * Similar to zfs_prop_get(), but returns the value as an integer. */ int zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, zprop_source_t *src, char *statbuf, size_t statlen) { char *source; /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) { return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, dgettext(TEXT_DOMAIN, "cannot get property '%s'"), zfs_prop_to_name(prop))); } if (src) *src = ZPROP_SRC_NONE; if (get_numeric_property(zhp, prop, src, &source, value) != 0) return (-1); get_source(zhp, src, source, statbuf, statlen); return (0); } #ifdef HAVE_IDMAP static int idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, char **domainp, idmap_rid_t *ridp) { idmap_get_handle_t *get_hdl = NULL; idmap_stat status; int err = EINVAL; if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) goto out; if (isuser) { err = idmap_get_sidbyuid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } else { err = idmap_get_sidbygid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } if (err == IDMAP_SUCCESS && idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && status == IDMAP_SUCCESS) err = 0; else err = EINVAL; out: if (get_hdl) idmap_get_destroy(get_hdl); return (err); } #endif /* HAVE_IDMAP */ /* * convert the propname into parameters needed by kernel * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234 * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234 * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123 * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789 */ static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) { zfs_userquota_prop_t type; char *cp; boolean_t isuser; boolean_t isgroup; boolean_t isproject; struct passwd *pw; struct group *gr; domain[0] = '\0'; /* Figure out the property type ({user|group|project}{quota|space}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(propname, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } if (type == ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *typep = type; isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_USEROBJUSED); isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_GROUPOBJUSED); isproject = (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED); cp = strchr(propname, '@') + 1; if (isuser && (pw = getpwnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = pw->pw_uid; } else if (isgroup && (gr = getgrnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = gr->gr_gid; } else if (!isproject && strchr(cp, '@')) { #ifdef HAVE_IDMAP /* * It's a SID name (eg "user@domain") that needs to be * turned into S-1-domainID-RID. */ directory_error_t e; char *numericsid = NULL; char *end; if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); if (isuser) { e = directory_sid_from_user_name(NULL, cp, &numericsid); } else { e = directory_sid_from_group_name(NULL, cp, &numericsid); } if (e != NULL) { directory_error_free(e); return (ENOENT); } if (numericsid == NULL) return (ENOENT); cp = numericsid; (void) strlcpy(domain, cp, domainlen); cp = strrchr(domain, '-'); *cp = '\0'; cp++; errno = 0; *ridp = strtoull(cp, &end, 10); free(numericsid); if (errno != 0 || *end != '\0') return (EINVAL); #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { /* It's a user/group/project ID (eg "12345"). */ uid_t id; char *end; id = strtoul(cp, &end, 10); if (*end != '\0') return (EINVAL); if (id > MAXUID && !isproject) { #ifdef HAVE_IDMAP /* It's an ephemeral ID. */ idmap_rid_t rid; char *mapdomain; if (idmap_id_to_numeric_domain_rid(id, isuser, &mapdomain, &rid) != 0) return (ENOENT); (void) strlcpy(domain, mapdomain, domainlen); *ridp = rid; #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { *ridp = id; } } return (0); } static int zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue, zfs_userquota_prop_t *typep) { int err; zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); zc.zc_objset_type = *typep; if (err) return (err); err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { zfs_userquota_prop_t type; return (zfs_prop_get_userquota_common(zhp, propname, propvalue, &type)); } int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; zfs_userquota_prop_t type; err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, &type); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else if (propvalue == 0 && (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTOBJQUOTA)) { (void) strlcpy(propbuf, "none", proplen); } else if (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(propvalue, propbuf, proplen); } else { zfs_nicenum(propvalue, propbuf, proplen); } return (0); } /* * propname must start with "written@" or "written#". */ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { int err; zfs_cmd_t zc = {"\0"}; const char *snapname; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); assert(zfs_prop_written(propname)); snapname = propname + strlen("written@"); if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ char *cp; (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; err = zfs_prop_get_written_int(zhp, propname, &propvalue); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else { zfs_nicebytes(propvalue, propbuf, proplen); } return (0); } /* * Returns the name of the given zfs handle. */ const char * zfs_get_name(const zfs_handle_t *zhp) { return (zhp->zfs_name); } /* * Returns the name of the parent pool for the given zfs handle. */ const char * zfs_get_pool_name(const zfs_handle_t *zhp) { return (zhp->zpool_hdl->zpool_name); } /* * Returns the type of the given zfs handle. */ zfs_type_t zfs_get_type(const zfs_handle_t *zhp) { return (zhp->zfs_type); } /* * Returns the type of the given zfs handle, * or, if a snapshot, the type of the snapshotted dataset. */ zfs_type_t zfs_get_underlying_type(const zfs_handle_t *zhp) { return (zhp->zfs_head_type); } /* * Is one dataset name a child dataset of another? * * Needs to handle these cases: * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" * Descendant? No. No. No. Yes. */ static boolean_t is_descendant(const char *ds1, const char *ds2) { size_t d1len = strlen(ds1); /* ds2 can't be a descendant if it's smaller */ if (strlen(ds2) < d1len) return (B_FALSE); /* otherwise, compare strings and verify that there's a '/' char */ return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); } /* * Given a complete name, return just the portion that refers to the parent. * Will return -1 if there is no parent (path is just the name of the * pool). */ static int parent_name(const char *path, char *buf, size_t buflen) { char *slashp; (void) strlcpy(buf, path, buflen); if ((slashp = strrchr(buf, '/')) == NULL) return (-1); *slashp = '\0'; return (0); } int zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen) { return (parent_name(zfs_get_name(zhp), buf, buflen)); } /* * If accept_ancestor is false, then check to make sure that the given path has * a parent, and that it exists. If accept_ancestor is true, then find the * closest existing ancestor for the given path. In prefixlen return the * length of already existing prefix of the given path. We also fetch the * 'zoned' property, which is used to validate property settings when creating * new datasets. */ static int check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, boolean_t accept_ancestor, int *prefixlen) { zfs_cmd_t zc = {"\0"}; char parent[ZFS_MAX_DATASET_NAME_LEN]; char *slash; zfs_handle_t *zhp; char errbuf[1024]; uint64_t is_zoned; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* check to see if the pool exists */ if ((slash = strchr(parent, '/')) == NULL) slash = parent + strlen(parent); (void) strncpy(zc.zc_name, parent, slash - parent); zc.zc_name[slash - parent] = '\0'; if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* check to see if the parent dataset exists */ while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { if (errno == ENOENT && accept_ancestor) { /* * Go deeper to find an ancestor, give up on top level. */ if (parent_name(parent, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } } else if (errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent does not exist")); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } else return (zfs_standard_error(hdl, errno, errbuf)); } is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned != NULL) *zoned = is_zoned; /* we are in a non-global zone, but parent is in the global zone */ if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); } /* make sure parent is a filesystem */ if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent is not a filesystem")); (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (-1); } zfs_close(zhp); if (prefixlen != NULL) *prefixlen = strlen(parent); return (0); } /* * Finds whether the dataset of the given type(s) exists. */ boolean_t zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) { zfs_handle_t *zhp; if (!zfs_validate_name(hdl, path, types, B_FALSE)) return (B_FALSE); /* * Try to get stats for the dataset, which will tell us if it exists. */ if ((zhp = make_dataset_handle(hdl, path)) != NULL) { int ds_type = zhp->zfs_type; zfs_close(zhp); if (types & ds_type) return (B_TRUE); } return (B_FALSE); } /* * Given a path to 'target', create all the ancestors between * the prefixlen portion of the path, and the target itself. * Fail if the initial prefixlen-ancestor does not already exist. */ int create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) { zfs_handle_t *h; char *cp; const char *opname; /* make sure prefix exists */ cp = target + prefixlen; if (*cp != '/') { assert(strchr(cp, '/') == NULL); h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); } else { *cp = '\0'; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); *cp = '/'; } if (h == NULL) return (-1); zfs_close(h); /* * Attempt to create, mount, and share any ancestor filesystems, * up to the prefixlen-long one. */ for (cp = target + prefixlen + 1; (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { *cp = '\0'; h = make_dataset_handle(hdl, target); if (h) { /* it already exists, nothing to do here */ zfs_close(h); continue; } if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); goto ancestorerr; } if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; } if (zfs_share(h) != 0) { opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } zfs_close(h); } zfs_commit_all_shares(); return (0); ancestorerr: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to %s ancestor '%s'"), opname, target); return (-1); } /* * Creates non-existing ancestors of the given path. */ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; char *path_copy; char errbuf[1024]; int rc = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* * Check that we are not passing the nesting limit * before we start creating any ancestors. */ if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { rc = create_parents(hdl, path_copy, prefix); free(path_copy); } if (path_copy == NULL || rc != 0) return (-1); return (0); } /* * Create a new filesystem or volume. */ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); uint64_t zoned; enum lzc_dataset_type ost; zpool_handle_t *zpool_handle; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[1024]; char parent[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* validate the path, taking care to note the extended error message */ if (!zfs_validate_name(hdl, path, type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* validate parents exist */ if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) return (-1); /* * The failure modes when creating a dataset of a different type over * one that already exists is a little strange. In particular, if you * try to create a dataset on top of an existing dataset, the ioctl() * will return ENOENT, not EEXIST. To prevent this from happening, we * first try to see if the dataset exists. */ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) ost = LZC_DATSET_TYPE_ZVOL; else ost = LZC_DATSET_TYPE_ZFS; /* open zpool handle for prop validation */ char pool_path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(pool_path, path, sizeof (pool_path)); /* truncate pool_path at first slash */ char *p = strchr(pool_path, '/'); if (p != NULL) *p = '\0'; if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) return (-1); if (props && (props = zfs_valid_proplist(hdl, type, props, zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) { zpool_close(zpool_handle); return (-1); } zpool_close(zpool_handle); if (type == ZFS_TYPE_VOLUME) { /* * If we are creating a volume, the size and block size must * satisfy a few restraints. First, the blocksize must be a * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the * volsize must be a multiple of the block size, and cannot be * zero. */ if (props == NULL || nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if ((ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blocksize)) != 0) { if (ret == ENOENT) { blocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); } else { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume block size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } if (size == 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size cannot be zero")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (size % blocksize != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size must be a multiple of volume block " "size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } (void) parent_name(path, parent, sizeof (parent)); if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE, &wkeydata, &wkeylen) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } /* create the dataset */ ret = lzc_create(path, ost, props, wkeydata, wkeylen); nvlist_free(props); if (wkeydata != NULL) free(wkeydata); /* check for failure */ if (ret != 0) { switch (errno) { case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(hdl, EZFS_NOENT, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to set this " "property or value")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption root's key is not loaded " "or provided")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ERANGE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property value(s) specified")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); #ifdef _ILP32 case EOVERFLOW: /* * This platform can't address a volume this big. */ if (type == ZFS_TYPE_VOLUME) return (zfs_error(hdl, EZFS_VOLTOOBIG, errbuf)); fallthrough; #endif default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (0); } /* * Destroys the given dataset. The caller must make sure that the filesystem * isn't mounted, and that there are no active dependents. If the file system * does not exist this function does nothing. */ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { int error; if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) return (EINVAL); if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_bookmarks(nv, NULL); fnvlist_free(nv); if (error != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } return (0); } if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { error = lzc_destroy(zhp->zfs_name); } if (error != 0 && error != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } remove_mountpoint(zhp); return (0); } struct destroydata { nvlist_t *nvl; const char *snapname; }; static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, dd->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) verify(nvlist_add_boolean(dd->nvl, name) == 0); rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); zfs_close(zhp); return (rv); } /* * Destroys all snapshots with the given name in zhp & descendants. */ int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); if (nvlist_empty(dd.nvl)) { ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), zhp->zfs_name, snapname); } else { ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); } nvlist_free(dd.nvl); return (ret); } /* * Destroys all the snapshots named in the nvlist. */ int zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) { - int ret; nvlist_t *errlist = NULL; nvpair_t *pair; + int ret = zfs_destroy_snaps_nvl_os(hdl, snaps); + if (ret != 0) + return (ret); + ret = lzc_destroy_snaps(snaps, defer, &errlist); if (ret == 0) { nvlist_free(errlist); return (0); } if (nvlist_empty(errlist)) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); ret = zfs_standard_error(hdl, ret, errbuf); } for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), nvpair_name(pair)); switch (fnvpair_value_int32(pair)) { case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot is cloned")); ret = zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: ret = zfs_standard_error(hdl, errno, errbuf); break; } } nvlist_free(errlist); return (ret); } /* * Clones the given dataset. The target must be of the same type as the source. */ int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) { char parent[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) return (-1); (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ if (props) { zfs_type_t type; if (ZFS_IS_VOLUME(zhp)) { type = ZFS_TYPE_VOLUME; } else { type = ZFS_TYPE_FILESYSTEM; } if ((props = zfs_valid_proplist(hdl, type, props, zoned, zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL) return (-1); if (zfs_fix_auto_resv(zhp, props) == -1) { nvlist_free(props); return (-1); } } if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } ret = lzc_clone(target, zhp->zfs_name, props); nvlist_free(props); if (ret != 0) { switch (errno) { case ENOENT: /* * The parent doesn't exist. We should have caught this * above, but there may a race condition that has since * destroyed the parent. * * At this point, we don't know whether it's the source * that doesn't exist anymore, or whether the target * dataset doesn't exist. */ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); case EXDEV: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "source and target pools differ")); return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, errbuf)); default: return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (ret); } /* * Promotes the given clone fs to be the clone parent. */ int zfs_promote(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; char snapname[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot promote '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots can not be promoted")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (zhp->zfs_dmustats.dds_origin[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); if (ret != 0) { switch (ret) { case EACCES: /* * Promoting encrypted dataset outside its * encryption root. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot promote dataset outside its " "encryption root")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); case EEXIST: /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), snapname, zhp->zfs_dmustats.dds_origin); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, ret, errbuf)); } } return (ret); } typedef struct snapdata { nvlist_t *sd_nvl; const char *sd_snapname; } snapdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snapdata_t *sd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp), sd->sd_snapname) >= sizeof (name)) return (EINVAL); fnvlist_add_boolean(sd->sd_nvl, name); rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); } zfs_close(zhp); return (rv); } /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. */ int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { int ret; char errbuf[1024]; nvpair_t *elem; nvlist_t *errors; zpool_handle_t *zpool_hdl; char pool[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshots ")); elem = NULL; while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { const char *snapname = nvpair_name(elem); /* validate the target name */ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, B_TRUE)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), snapname); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } /* * get pool handle for prop validation. assumes all snaps are in the * same pool, as does lzc_snapshot (below). */ elem = nvlist_next_nvpair(snaps, NULL); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; zpool_hdl = zpool_open(hdl, pool); if (zpool_hdl == NULL) return (-1); if (props != NULL && (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) { zpool_close(zpool_hdl); return (-1); } zpool_close(zpool_hdl); ret = lzc_snapshot(snaps, props, &errors); if (ret != 0) { boolean_t printed = B_FALSE; for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), nvpair_name(elem)); (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); printed = B_TRUE; } if (!printed) { switch (ret) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple snapshots of same " "fs not allowed")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } } nvlist_free(props); nvlist_free(errors); return (ret); } int zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, nvlist_t *props) { int ret; snapdata_t sd = { 0 }; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; zfs_handle_t *zhp; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot snapshot %s"), path); if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); (void) strlcpy(fsname, path, sizeof (fsname)); cp = strchr(fsname, '@'); *cp = '\0'; sd.sd_snapname = cp + 1; if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { return (-1); } verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); if (recursive) { (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); } else { fnvlist_add_boolean(sd.sd_nvl, path); } ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); zfs_close(zhp); return (ret); } /* * Destroy any more recent snapshots. We invoke this callback on any dependents * of the snapshot first. If the 'cb_dependent' member is non-zero, then this * is a dependent and we should just destroy it without checking the transaction * group. */ typedef struct rollback_data { const char *cb_target; /* the snapshot */ uint64_t cb_create; /* creation time reference */ boolean_t cb_error; boolean_t cb_force; } rollback_data_t; static int rollback_destroy_dependent(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; prop_changelist_t *clp; /* We must destroy this clone; first unmount it */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, cbp->cb_force ? MS_FORCE: 0); if (clp == NULL || changelist_prefix(clp) != 0) { cbp->cb_error = B_TRUE; zfs_close(zhp); return (0); } if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); (void) changelist_postfix(clp); changelist_free(clp); zfs_close(zhp); return (0); } static int rollback_destroy(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, rollback_destroy_dependent, cbp); cbp->cb_error |= zfs_destroy(zhp, B_FALSE); } zfs_close(zhp); return (0); } /* * Given a dataset, rollback to a specific snapshot, discarding any * data changes since then and making it the active dataset. * * Any snapshots and bookmarks more recent than the target are * destroyed, along with their dependents (i.e. clones). */ int zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) { rollback_data_t cb = { 0 }; int err; boolean_t restore_resv = 0; uint64_t old_volsize = 0, new_volsize; zfs_prop_t resv_prop = { 0 }; uint64_t min_txg = 0; assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || zhp->zfs_type == ZFS_TYPE_VOLUME); /* * Destroy all recent snapshots and their dependents. */ cb.cb_force = force; cb.cb_target = snap->zfs_name; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); if (cb.cb_create > 0) min_txg = cb.cb_create; (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb, min_txg, 0); (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); if (cb.cb_error) return (-1); /* * Now that we have verified that the snapshot is the latest, * rollback to the given snapshot. */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); restore_resv = (old_volsize == zfs_prop_get_int(zhp, resv_prop)); } /* * Pass both the filesystem and the wanted snapshot names, * we would get an error back if the snapshot is destroyed or * a new snapshot is created before this request is processed. */ err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); if (err != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), zhp->zfs_name); switch (err) { case EEXIST: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "there is a snapshot or bookmark more recent " "than '%s'"), snap->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); break; case ESRCH: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not found among snapshots of '%s'"), snap->zfs_name, zhp->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); break; case EINVAL: (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); } return (err); } /* * For volumes, if the pre-rollback volsize matched the pre- * rollback reservation and the volsize has changed then set * the reservation property to the post-rollback volsize. * Make a new handle since the rollback closed the dataset. */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) err = zfs_prop_set_int(zhp, resv_prop, new_volsize); } zfs_close(zhp); } return (err); } /* * Renames the given dataset. */ int zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) { int ret = 0; zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; /* if we have the same exact name, just return success */ if (strcmp(zhp->zfs_name, target) == 0) return (0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename to '%s'"), target); /* make sure source name is valid */ if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* * Make sure the target name is valid */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { if ((strchr(target, '@') == NULL) || *target == '@') { /* * Snapshot target name is abbreviated, * reconstruct full dataset name */ (void) strlcpy(parent, zhp->zfs_name, sizeof (parent)); delim = strchr(parent, '@'); if (strchr(target, '@') == NULL) *(++delim) = '\0'; else *delim = '\0'; (void) strlcat(parent, target, sizeof (parent)); target = parent; } else { /* * Make sure we're renaming within the same dataset. */ delim = strchr(target, '@'); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '@') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots must be part of same " "dataset")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { if (flags.recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents */ if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "datasets must be within same pool")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } /* new name cannot be a child of the current dataset name */ if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); if (getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Avoid unmounting file systems with mountpoint property set to * 'legacy' or 'none' even if -u option is not given. */ if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && !flags.recursive && !flags.nounmount && zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && (strcmp(property, "legacy") == 0 || strcmp(property, "none") == 0)) { flags.nounmount = B_TRUE; } if (flags.recursive) { char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { ret = -1; goto error; } delim = strchr(parentname, '@'); *delim = '\0'; zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); free(parentname); if (zhrp == NULL) { ret = -1; goto error; } zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, flags.nounmount ? CL_GATHER_DONT_UNMOUNT : CL_GATHER_ITER_MOUNTED, flags.forceunmount ? MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); (void) zfs_error(hdl, EZFS_ZONED, errbuf); ret = -1; goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; } if (ZFS_IS_VOLUME(zhp)) zc.zc_objset_type = DMU_OST_ZVOL; else zc.zc_objset_type = DMU_OST_ZFS; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); zc.zc_cookie = !!flags.recursive; zc.zc_cookie |= (!!flags.nounmount) << 1; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* * if it was recursive, the one that actually failed will * be in zc.zc_name */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); if (flags.recursive && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot move encrypted child outside of " "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } /* * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ if (cl != NULL) (void) changelist_postfix(cl); } else { if (cl != NULL) { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } } error: if (cl != NULL) { changelist_free(cl); } return (ret); } nvlist_t * zfs_get_all_props(zfs_handle_t *zhp) { return (zhp->zfs_props); } nvlist_t * zfs_get_recvd_props(zfs_handle_t *zhp) { if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (NULL); return (zhp->zfs_recvd_props); } nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: * * - If this is a list of all properties, then expand the list to include * all native properties, and set a flag so that for each dataset we look * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen * so that we can size the column appropriately. If the user has * requested received property values, we also need to compute the width * of the RECEIVED column. */ int zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, boolean_t literal) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; zprop_list_t **last, **start; nvlist_t *userprops, *propval; nvpair_t *elem; char *strval; char buf[ZFS_MAXPROPLEN]; if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) return (-1); userprops = zfs_get_user_props(zhp); entry = *plp; if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { /* * Go through and add any user properties as necessary. We * start by incrementing our list pointer to the first * non-native property. */ start = plp; while (*start != NULL) { if ((*start)->pl_prop == ZPROP_INVAL) break; start = &(*start)->pl_next; } elem = NULL; while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { /* * See if we've already found this property in our list. */ for (last = start; *last != NULL; last = &(*last)->pl_next) { if (strcmp((*last)->pl_user_prop, nvpair_name(elem)) == 0) break; } if (*last == NULL) { if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL || ((entry->pl_user_prop = zfs_strdup(hdl, nvpair_name(elem)))) == NULL) { free(entry); return (-1); } entry->pl_prop = ZPROP_INVAL; entry->pl_width = strlen(nvpair_name(elem)); entry->pl_all = B_TRUE; *last = entry; } } } /* * Now go through and check the width of any non-fixed columns */ for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, entry->pl_prop, buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (received && zfs_prop_get_recvd(zhp, zfs_prop_to_name(entry->pl_prop), buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } else { if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, &propval) == 0) { verify(nvlist_lookup_string(propval, ZPROP_VALUE, &strval) == 0); if (strlen(strval) > entry->pl_width) entry->pl_width = strlen(strval); } if (received && zfs_prop_get_recvd(zhp, entry->pl_user_prop, buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } } return (0); } void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) { nvpair_t *curr; nvpair_t *next; /* * Keep a reference to the props-table against which we prune the * properties. */ zhp->zfs_props_table = props; curr = nvlist_next_nvpair(zhp->zfs_props, NULL); while (curr) { zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); next = nvlist_next_nvpair(zhp->zfs_props, curr); /* * User properties will result in ZPROP_INVAL, and since we * only know how to prune standard ZFS properties, we always * leave these in the list. This can also happen if we * encounter an unknown DSL property (when running older * software, for example). */ if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) (void) nvlist_remove(zhp->zfs_props, nvpair_name(curr), nvpair_type(curr)); curr = next; } } static int zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, zfs_smb_acl_op_t cmd, char *resource1, char *resource2) { zfs_cmd_t zc = {"\0"}; nvlist_t *nvlist = NULL; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); zc.zc_cookie = (uint64_t)cmd; if (cmd == ZFS_SMB_ACL_RENAME) { if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (0); } } switch (cmd) { case ZFS_SMB_ACL_ADD: case ZFS_SMB_ACL_REMOVE: (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); break; case ZFS_SMB_ACL_RENAME: if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, resource1) != 0) { (void) no_memory(hdl); return (-1); } if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, resource2) != 0) { (void) no_memory(hdl); return (-1); } if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { nvlist_free(nvlist); return (-1); } break; case ZFS_SMB_ACL_PURGE: break; default: return (-1); } error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); nvlist_free(nvlist); return (error); } int zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, resource, NULL)); } int zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, resource, NULL)); } int zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, NULL, NULL)); } int zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, char *oldname, char *newname) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, oldname, newname)); } int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zfs_userspace_cb_t func, void *arg) { zfs_cmd_t zc = {"\0"}; zfs_useracct_t buf[100]; libzfs_handle_t *hdl = zhp->zfs_hdl; int ret; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; for (;;) { zfs_useracct_t *zua = buf; zc.zc_nvlist_dst_size = sizeof (buf); if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { if ((errno == ENOTSUP && (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA))) break; return (zfs_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get used/quota for %s"), zc.zc_name)); } if (zc.zc_nvlist_dst_size == 0) break; while (zc.zc_nvlist_dst_size > 0) { if ((ret = func(arg, zua->zu_domain, zua->zu_rid, zua->zu_space)) != 0) return (ret); zua++; zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); } } return (0); } struct holdarg { nvlist_t *nvl; const char *snapname; const char *tag; boolean_t recursive; int error; }; static int zfs_hold_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_string(ha->nvl, name, ha->tag); if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); zfs_close(zhp); return (rv); } int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive, int cleanup_fd) { int ret; struct holdarg ha; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { char errbuf[1024]; fnvlist_free(ha.nvl); ret = ENOENT; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s@%s'"), zhp->zfs_name, snapname); (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); return (ret); } ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); fnvlist_free(ha.nvl); return (ret); } int zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) { int ret; nvlist_t *errors; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; nvpair_t *elem; errors = NULL; ret = lzc_hold(holds, cleanup_fd, &errors); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold")); switch (ret) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case E2BIG: /* * Temporary tags wind up having the ds object id * prepended. So even if we passed the length check * above, it's still possible for the tag to wind * up being slightly too long. */ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case EEXIST: (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } static int zfs_release_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; nvlist_t *existing_holds; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) { ha->error = EINVAL; rv = EINVAL; } if (lzc_get_holds(name, &existing_holds) != 0) { ha->error = ENOENT; } else if (!nvlist_exists(existing_holds, ha->tag)) { ha->error = ESRCH; } else { nvlist_t *torelease = fnvlist_alloc(); fnvlist_add_boolean(torelease, ha->tag); fnvlist_add_nvlist(ha->nvl, name, torelease); fnvlist_free(torelease); } if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); zfs_close(zhp); return (rv); } int zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive) { int ret; struct holdarg ha; nvlist_t *errors = NULL; nvpair_t *elem; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; ha.error = 0; (void) zfs_release_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { fnvlist_free(ha.nvl); ret = ha.error; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s@%s'"), zhp->zfs_name, snapname); if (ret == ESRCH) { (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); } else { (void) zfs_standard_error(hdl, ret, errbuf); } return (ret); } ret = lzc_release(ha.nvl, &errors); fnvlist_free(ha.nvl); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release")); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zfs_standard_error(hdl, errno, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case ESRCH: (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } int zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; int nvsz = 2048; void *nvbuf; int err = 0; char errbuf[1024]; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); tryagain: nvbuf = malloc(nvsz); if (nvbuf == NULL) { err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); goto out; } zc.zc_nvlist_dst_size = nvsz; zc.zc_nvlist_dst = (uintptr_t)nvbuf; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); switch (errno) { case ENOMEM: free(nvbuf); nvsz = zc.zc_nvlist_dst_size; goto tryagain; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } else { /* success */ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); if (rc) { err = zfs_standard_error_fmt(hdl, rc, dgettext( TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); } } free(nvbuf); out: return (err); } int zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; char *nvbuf; char errbuf[1024]; size_t nvsz; int err; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); assert(err == 0); nvbuf = malloc(nvsz); err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); assert(err == 0); zc.zc_nvlist_src_size = nvsz; zc.zc_nvlist_src = (uintptr_t)nvbuf; zc.zc_perm_action = un; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), zc.zc_name); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } free(nvbuf); return (err); } int zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) { int err; char errbuf[1024]; err = lzc_get_holds(zhp->zfs_name, nvl); if (err != 0) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), zhp->zfs_name); switch (err) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } return (err); } /* * The theory of raidz space accounting * * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block * will "reference" 128KB, even though it allocates more than that, to store the * parity information (and perhaps skip sectors). This concept of the * "referenced" (and other DMU space accounting) being lower than the allocated * space by a constant factor is called "raidz deflation." * * As mentioned above, the constant factor for raidz deflation assumes a 128KB * block size. However, zvols typically have a much smaller block size (default * 8KB). These smaller blocks may require proportionally much more parity * information (and perhaps skip sectors). In this case, the change to the * "referenced" property may be much more than the logical block size. * * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written * as follows. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D8 | D16 | D24 | * | P1 | D1 | D9 | D17 | D25 | * | P2 | D2 | D10 | D18 | D26 | * | P3 | D3 | D11 | D19 | D27 | * | P4 | D4 | D12 | D20 | D28 | * | P5 | D5 | D13 | D21 | D29 | * | P6 | D6 | D14 | D22 | D30 | * | P7 | D7 | D15 | D23 | D31 | * +-------+-------+-------+-------+-------+ * * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data * sectors. The dataset's referenced will increase by 128k and the pool's * allocated and free properties will be adjusted by 160k. * * A 4k block written to the same raidz vdev will require two 4k sectors. The * blank cells represent unallocated space. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | | | | * +-------+-------+-------+-------+-------+ * * Above, notice that the 4k block required one sector for parity and another * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated * and free properties will be adjusted by 8k. The dataset will not be charged * 8k. Rather, it will be charged a value that is scaled according to the * overhead of the 128k block on the same vdev. This 8k allocation will be * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as * calculated in the 128k block example above. * * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 * allocations are a multiple of 3 sectors, and raidz3 allocations are a * multiple of of 4 sectors. When a block does not fill the required number of * sectors, skip blocks (sectors) are used. * * An 8k block being written to a raidz vdev may be written as follows: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | | * +-------+-------+-------+-------+-------+ * * In order to maintain the nparity+1 allocation size, a skip block (S0) was * added. For this 8k block, the pool's allocated and free properties are * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * * The situation is slightly different for dRAID since the minimum allocation * size is the full group width. The same 8K block above would be written as * follows in a dRAID group: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | S1 | * +-------+-------+-------+-------+-------+ * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. * Note that metadata blocks will typically be compressed, so the reservation * size returned by zvol_volsize_to_reservation() will generally be slightly * larger than the maximum that the volume can reference. */ /* * Derived from function of same name in module/zfs/vdev_raidz.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. Note that the "referenced" space accounted will be less than this, but * not necessarily equal to "blksize", due to RAIDZ deflation. */ static uint64_t vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { uint64_t asize, ndata; ASSERT3U(ndisks, >, nparity); ndata = ndisks - nparity; asize = ((blksize - 1) >> ashift) + 1; asize += nparity * ((asize + ndata - 1) / ndata); asize = roundup(asize, nparity + 1) << ashift; return (asize); } /* * Derived from function of same name in module/zfs/vdev_draid.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. */ static uint64_t vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { ASSERT3U(ndisks, >, nparity); uint64_t ndata = ndisks - nparity; uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; uint64_t asize = (rows * ndisks) << ashift; return (asize); } /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. Returns the size in bytes required to store one * copy of the volume data. See theory comment above. */ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; uint_t nvdevs; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (nblocks * blksize); } for (int v = 0; v < nvdevs; v++) { char *type; uint64_t nparity, ashift, asize, tsize; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, &type) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && strcmp(type, VDEV_TYPE_DRAID) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, &nparity) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { nvlist_t **disks; uint_t ndisks; if (nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) continue; /* allocation size for the "typical" 128k block */ tsize = vdev_raidz_asize(ndisks, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); } else { uint64_t ndata; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0) continue; /* allocation size for the "typical" 128k block */ tsize = vdev_draid_asize(ndata + nparity, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_draid_asize(ndata + nparity, nparity, ashift, blksize); } /* * Scale this size down as a ratio of 128k / tsize. * See theory statement above. */ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; if (volsize > ret) { ret = volsize; } } if (ret == 0) { ret = nblocks * blksize; } return (ret); } /* * Convert the zvol's volume size to an appropriate reservation. See theory * comment above. * * Note: If this routine is updated, it is necessary to update the ZFS test * suite's shell version in reservation.shlib. */ uint64_t zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; int ncopies; char *strval; if (nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) ncopies = atoi(strval); else ncopies = 1; if (nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; nblocks = volsize / volblocksize; /* * Metadata defaults to using 128k blocks, not volblocksize blocks. For * this reason, only the data blocks are scaled based on vdev config. */ volsize = volsize_from_vdevs(zph, nblocks, volblocksize); /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ while (nblocks > 1) { nblocks += DNODES_PER_LEVEL - 1; nblocks /= DNODES_PER_LEVEL; numdb += nblocks; } numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); volsize *= ncopies; /* * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't * compressed, but in practice they compress down to about * 1100 bytes */ numdb *= 1ULL << DN_MAX_INDBLKSHIFT; volsize += numdb; return (volsize); } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent fses are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zfs's wait cmd). In that * scenario, a fs being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the fs doesn't * exist. */ int zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait_fs(zhp->zfs_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), zhp->zfs_name); } return (error); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_iter.c b/sys/contrib/openzfs/lib/libzfs/libzfs_iter.c index 7806e21cd9a9..3c537be79487 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_iter.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_iter.c @@ -1,600 +1,603 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2019 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" static int zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data) { nvlist_t *nvl = zfs_get_clones_nvl(zhp); nvpair_t *pair; if (nvl == NULL) return (0); for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (clone != NULL) { int err = func(clone, data); if (err != 0) return (err); } } return (0); } static int zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) { int rc; uint64_t orig_cookie; orig_cookie = zc->zc_cookie; top: (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); rc = zfs_ioctl(zhp->zfs_hdl, arg, zc); if (rc == -1) { switch (errno) { case ENOMEM: /* expand nvlist memory and try again */ if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { zcmd_free_nvlists(zc); return (-1); } zc->zc_cookie = orig_cookie; goto top; /* * An errno value of ESRCH indicates normal completion. * If ENOENT is returned, then the underlying dataset * has been removed since we obtained the handle. */ case ESRCH: case ENOENT: rc = 1; break; default: rc = zfs_standard_error(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot iterate filesystems")); break; } } return (rc); } /* * Iterate over all child filesystems */ int zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *nzhp; int ret; if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) return (0); if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) return (-1); while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, &zc)) == 0) { /* * Silently ignore errors, as the only plausible explanation is * that the pool has since been removed. */ if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc)) == NULL) { continue; } if ((ret = func(nzhp, data)) != 0) { zcmd_free_nvlists(&zc); return (ret); } } zcmd_free_nvlists(&zc); return ((ret < 0) ? ret : 0); } /* * Iterate over all snapshots */ int zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, void *data, uint64_t min_txg, uint64_t max_txg) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *nzhp; int ret; nvlist_t *range_nvl = NULL; if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT || zhp->zfs_type == ZFS_TYPE_BOOKMARK) return (0); zc.zc_simple = simple; if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) return (-1); if (min_txg != 0) { range_nvl = fnvlist_alloc(); fnvlist_add_uint64(range_nvl, SNAP_ITER_MIN_TXG, min_txg); } if (max_txg != 0) { if (range_nvl == NULL) range_nvl = fnvlist_alloc(); fnvlist_add_uint64(range_nvl, SNAP_ITER_MAX_TXG, max_txg); } if (range_nvl != NULL && zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, range_nvl) != 0) { zcmd_free_nvlists(&zc); fnvlist_free(range_nvl); return (-1); } while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc)) == 0) { if (simple) nzhp = make_dataset_simple_handle_zc(zhp, &zc); else nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc); if (nzhp == NULL) continue; if ((ret = func(nzhp, data)) != 0) { zcmd_free_nvlists(&zc); fnvlist_free(range_nvl); return (ret); } } zcmd_free_nvlists(&zc); fnvlist_free(range_nvl); return ((ret < 0) ? ret : 0); } /* * Iterate over all bookmarks */ int zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) { zfs_handle_t *nzhp; nvlist_t *props = NULL; nvlist_t *bmarks = NULL; int err; nvpair_t *pair; if ((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) != 0) return (0); /* Setup the requested properties nvlist. */ props = fnvlist_alloc(); for (zfs_prop_t p = 0; p < ZFS_NUM_PROPS; p++) { if (zfs_prop_valid_for_type(p, ZFS_TYPE_BOOKMARK, B_FALSE)) { fnvlist_add_boolean(props, zfs_prop_to_name(p)); } } fnvlist_add_boolean(props, "redact_complete"); if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0) goto out; for (pair = nvlist_next_nvpair(bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { char name[ZFS_MAX_DATASET_NAME_LEN]; char *bmark_name; nvlist_t *bmark_props; bmark_name = nvpair_name(pair); bmark_props = fnvpair_value_nvlist(pair); if (snprintf(name, sizeof (name), "%s#%s", zhp->zfs_name, bmark_name) >= sizeof (name)) { err = EINVAL; goto out; } nzhp = make_bookmark_handle(zhp, name, bmark_props); if (nzhp == NULL) continue; if ((err = func(nzhp, data)) != 0) goto out; } out: fnvlist_free(props); fnvlist_free(bmarks); return (err); } /* * Routines for dealing with the sorted snapshot functionality */ typedef struct zfs_node { zfs_handle_t *zn_handle; avl_node_t zn_avlnode; } zfs_node_t; static int zfs_sort_snaps(zfs_handle_t *zhp, void *data) { avl_tree_t *avl = data; zfs_node_t *node; zfs_node_t search; search.zn_handle = zhp; node = avl_find(avl, &search, NULL); if (node) { /* * If this snapshot was renamed while we were creating the * AVL tree, it's possible that we already inserted it under * its old name. Remove the old handle before adding the new * one. */ zfs_close(node->zn_handle); avl_remove(avl, node); free(node); } node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); node->zn_handle = zhp; avl_add(avl, node); return (0); } static int zfs_snapshot_compare(const void *larg, const void *rarg) { zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; uint64_t lcreate, rcreate; /* * Sort them according to creation time. We use the hidden * CREATETXG property to get an absolute ordering of snapshots. */ lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); return (TREE_CMP(lcreate, rcreate)); } int zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data, uint64_t min_txg, uint64_t max_txg) { int ret = 0; zfs_node_t *node; avl_tree_t avl; void *cookie = NULL; avl_create(&avl, zfs_snapshot_compare, sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); ret = zfs_iter_snapshots(zhp, B_FALSE, zfs_sort_snaps, &avl, min_txg, max_txg); for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) ret |= callback(node->zn_handle, data); while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) free(node); avl_destroy(&avl); return (ret); } typedef struct { char *ssa_first; char *ssa_last; boolean_t ssa_seenfirst; boolean_t ssa_seenlast; zfs_iter_f ssa_func; void *ssa_arg; } snapspec_arg_t; static int snapspec_cb(zfs_handle_t *zhp, void *arg) { snapspec_arg_t *ssa = arg; const char *shortsnapname; int err = 0; if (ssa->ssa_seenlast) return (0); shortsnapname = strchr(zfs_get_name(zhp), '@') + 1; if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0) ssa->ssa_seenfirst = B_TRUE; if (strcmp(shortsnapname, ssa->ssa_last) == 0) ssa->ssa_seenlast = B_TRUE; if (ssa->ssa_seenfirst) { err = ssa->ssa_func(zhp, ssa->ssa_arg); } else { zfs_close(zhp); } return (err); } /* * spec is a string like "A,B%C,D" * * , where can be: * (single snapshot) * % (range of snapshots, inclusive) * % (range of snapshots, starting with earliest) * % (range of snapshots, ending with last) * % (all snapshots) * [,...] (comma separated list of the above) * * If a snapshot can not be opened, continue trying to open the others, but * return ENOENT at the end. */ int zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, zfs_iter_f func, void *arg) { char *buf, *comma_separated, *cp; int err = 0; int ret = 0; buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig); cp = buf; while ((comma_separated = strsep(&cp, ",")) != NULL) { char *pct = strchr(comma_separated, '%'); if (pct != NULL) { snapspec_arg_t ssa = { 0 }; ssa.ssa_func = func; ssa.ssa_arg = arg; if (pct == comma_separated) ssa.ssa_seenfirst = B_TRUE; else ssa.ssa_first = comma_separated; *pct = '\0'; ssa.ssa_last = pct + 1; /* * If there is a lastname specified, make sure it * exists. */ if (ssa.ssa_last[0] != '\0') { char snapname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zfs_get_name(fs_zhp), ssa.ssa_last); if (!zfs_dataset_exists(fs_zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT)) { ret = ENOENT; continue; } } err = zfs_iter_snapshots_sorted(fs_zhp, snapspec_cb, &ssa, 0, 0); if (ret == 0) ret = err; if (ret == 0 && (!ssa.ssa_seenfirst || (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) { ret = ENOENT; } } else { char snapname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *snap_zhp; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zfs_get_name(fs_zhp), comma_separated); snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl, snapname); if (snap_zhp == NULL) { ret = ENOENT; continue; } err = func(snap_zhp, arg); if (ret == 0) ret = err; } } free(buf); return (ret); } /* * Iterate over all children, snapshots and filesystems * Process snapshots before filesystems because they are nearer the input * handle: this is extremely important when used with zfs_iter_f functions * looking for data, following the logic that we would like to find it as soon * and as close as possible. */ int zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) { int ret; if ((ret = zfs_iter_snapshots(zhp, B_FALSE, func, data, 0, 0)) != 0) return (ret); return (zfs_iter_filesystems(zhp, func, data)); } typedef struct iter_stack_frame { struct iter_stack_frame *next; zfs_handle_t *zhp; } iter_stack_frame_t; typedef struct iter_dependents_arg { boolean_t first; boolean_t allowrecursion; iter_stack_frame_t *stack; zfs_iter_f func; void *data; } iter_dependents_arg_t; static int iter_dependents_cb(zfs_handle_t *zhp, void *arg) { iter_dependents_arg_t *ida = arg; int err = 0; boolean_t first = ida->first; ida->first = B_FALSE; if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { err = zfs_iter_clones(zhp, iter_dependents_cb, ida); } else if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) { iter_stack_frame_t isf; iter_stack_frame_t *f; /* * check if there is a cycle by seeing if this fs is already * on the stack. */ for (f = ida->stack; f != NULL; f = f->next) { if (f->zhp->zfs_dmustats.dds_guid == zhp->zfs_dmustats.dds_guid) { if (ida->allowrecursion) { zfs_close(zhp); return (0); } else { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "recursive dependency at '%s'"), zfs_get_name(zhp)); err = zfs_error(zhp->zfs_hdl, EZFS_RECURSIVE, dgettext(TEXT_DOMAIN, "cannot determine dependent " "datasets")); zfs_close(zhp); return (err); } } } isf.zhp = zhp; isf.next = ida->stack; ida->stack = &isf; err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida); if (err == 0) err = zfs_iter_snapshots(zhp, B_FALSE, iter_dependents_cb, ida, 0, 0); ida->stack = isf.next; } if (!first && err == 0) err = ida->func(zhp, ida->data); else zfs_close(zhp); return (err); } int zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, zfs_iter_f func, void *data) { iter_dependents_arg_t ida; ida.allowrecursion = allowrecursion; ida.stack = NULL; ida.func = func; ida.data = data; ida.first = B_TRUE; return (iter_dependents_cb(zfs_handle_dup(zhp), &ida)); } /* * Iterate over mounted children of the specified dataset */ int zfs_iter_mounted(zfs_handle_t *zhp, zfs_iter_f func, void *data) { char mnt_prop[ZFS_MAXPROPLEN]; struct mnttab entry; zfs_handle_t *mtab_zhp; size_t namelen = strlen(zhp->zfs_name); FILE *mnttab; int err = 0; if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); while (err == 0 && getmntent(mnttab, &entry) == 0) { /* Ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* Ignore datasets not within the provided dataset */ if (strncmp(entry.mnt_special, zhp->zfs_name, namelen) != 0 || - (entry.mnt_special[namelen] != '/' && - entry.mnt_special[namelen] != '@')) + entry.mnt_special[namelen] != '/') + continue; + + /* Skip snapshot of any child dataset */ + if (strchr(entry.mnt_special, '@') != NULL) continue; if ((mtab_zhp = zfs_open(zhp->zfs_hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) continue; /* Ignore legacy mounts as they are user managed */ verify(zfs_prop_get(mtab_zhp, ZFS_PROP_MOUNTPOINT, mnt_prop, sizeof (mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(mnt_prop, "legacy") == 0) { zfs_close(mtab_zhp); continue; } err = func(mtab_zhp, data); } fclose(mnttab); return (err); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c index bee0aff4b49c..e95f28088df6 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c @@ -1,5190 +1,5212 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #include #include #include static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, const char *, nvlist_t *); static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_estimate; int pa_verbosity; } progress_arg_t; static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. */ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; uint64_t guid; guid = fnvpair_value_uint64(snapelem); if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = guid; /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ if (avl_find(fsavl, fn, NULL) == NULL) avl_add(fsavl, fn); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; nvlist_t *snapholds; /* user holds */ /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t raw; boolean_t doall; boolean_t replicate; boolean_t skipmissing; boolean_t verbose; boolean_t backup; boolean_t seenfrom; boolean_t seento; boolean_t holds; /* were holds requested with send -h */ boolean_t props; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * "snapholds" -> { name (lastname) -> { holdname -> crtime } } * * "origin" -> number (guid) (if clone) * "is_encroot" -> boolean * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv); static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; char *snapname; nvlist_t *nv; boolean_t isfromsnap, istosnap, istosnapwithnofrom; snapname = strrchr(zhp->zfs_name, '@')+1; isfromsnap = (sd->fromsnap != NULL && strcmp(sd->fromsnap, snapname) == 0); istosnap = (sd->tosnap != NULL && (strcmp(sd->tosnap, snapname) == 0)); istosnapwithnofrom = (istosnap && sd->fromsnap == NULL); if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, sd->tosnap); } zfs_close(zhp); return (0); } fnvlist_add_uint64(sd->parent_snaps, snapname, guid); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. */ if (isfromsnap || (sd->parent_fromsnap_guid == 0 && istosnap)) { sd->parent_fromsnap_guid = guid; } if (!sd->recursive) { /* * To allow a doall stream to work properly * with a NULL fromsnap */ if (sd->doall && sd->fromsnap == NULL && !sd->seenfrom) { sd->seenfrom = B_TRUE; } if (!sd->seenfrom && isfromsnap) { sd->seenfrom = B_TRUE; zfs_close(zhp); return (0); } if ((sd->seento || !sd->seenfrom) && !istosnapwithnofrom) { zfs_close(zhp); return (0); } if (istosnap) sd->seento = B_TRUE; } nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); fnvlist_add_nvlist(sd->snapprops, snapname, nv); fnvlist_free(nv); if (sd->holds) { nvlist_t *holds = fnvlist_alloc(); int err = lzc_get_holds(zhp->zfs_name, &holds); if (err == 0) { fnvlist_add_nvlist(sd->snapholds, snapname, holds); } fnvlist_free(holds); } zfs_close(zhp); return (0); } static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) { nvlist_t *props = NULL; nvpair_t *elem = NULL; if (received_only) props = zfs_get_recvd_props(zhp); else props = zhp->zfs_props; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } verify(nvpair_value_nvlist(elem, &propnv) == 0); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. */ if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; value = fnvlist_lookup_string(propnv, ZPROP_VALUE); fnvlist_add_string(nv, propname, value); } else { uint64_t value; value = fnvlist_lookup_uint64(propnv, ZPROP_VALUE); fnvlist_add_uint64(nv, propname, value); } } } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs = NULL, *nv = NULL; int rv = 0; uint64_t min_txg = 0, max_txg = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * on the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - skip sending the current dataset if it was created later than * the parent tosnap * - return error if the current dataset was created earlier than * the parent tosnap, unless --skip-missing specified. Then * just print a warning */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else if (sd->skipmissing) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: skipping dataset %s and its children:" " snapshot %s does not exist\n"), zhp->zfs_name, sd->tosnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = EZFS_NOENT; } goto out; } nvfs = fnvlist_alloc(); fnvlist_add_string(nvfs, "name", zhp->zfs_name); fnvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid); if (zhp->zfs_dmustats.dds_origin[0]) { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } fnvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid); zfs_close(origin); } /* iterate over props */ if (sd->props || sd->backup || sd->recursive) { nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); } if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { boolean_t encroot; /* determine if this dataset is an encryption root */ if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0) { rv = -1; goto out; } if (encroot) fnvlist_add_boolean(nvfs, "is_encroot"); /* * Encrypted datasets can only be sent with properties if * the raw flag is specified because the receive side doesn't * currently have a mechanism for recursively asking the user * for new encryption parameters. */ if (!sd->raw) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s: encrypted dataset %s may not " "be sent with properties without the raw flag\n"), sd->fsname, sd->tosnap, zhp->zfs_name); rv = -1; goto out; } } if (nv != NULL) fnvlist_add_nvlist(nvfs, "props", nv); /* iterate over snaps, and set sd->parent_fromsnap_guid */ sd->parent_fromsnap_guid = 0; sd->parent_snaps = fnvlist_alloc(); sd->snapprops = fnvlist_alloc(); if (sd->holds) sd->snapholds = fnvlist_alloc(); /* * If this is a "doall" send, a replicate send or we're just trying * to gather a list of previous snapshots, iterate through all the * snaps in the txg range. Otherwise just look at the one we're * interested in. */ if (sd->doall || sd->replicate || sd->tosnap == NULL) { if (!sd->replicate && fromsnap_txg != 0) min_txg = fromsnap_txg; if (!sd->replicate && tosnap_txg != 0) max_txg = tosnap_txg; (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sd->tosnap); if (sd->fromsnap != NULL) sd->seenfrom = B_TRUE; snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) (void) send_iterate_snap(snap, sd); } fnvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps); fnvlist_add_nvlist(nvfs, "snapprops", sd->snapprops); if (sd->holds) fnvlist_add_nvlist(nvfs, "snapholds", sd->snapholds); fnvlist_free(sd->parent_snaps); fnvlist_free(sd->snapprops); fnvlist_free(sd->snapholds); /* Do not allow the size of the properties list to exceed the limit */ if ((fnvlist_size(nvfs) + fnvlist_size(sd->fss)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "warning: cannot send %s@%s: the size of the list of " "snapshots and properties is too large to be received " "successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name, sd->tosnap); rv = EZFS_NOSPC; goto out; } /* add this fs to nvlist */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); fnvlist_add_nvlist(sd->fss, guidstring, nvfs); /* iterate over children */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); out: sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; fnvlist_free(nv); fnvlist_free(nvfs); zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, boolean_t replicate, boolean_t skipmissing, boolean_t verbose, boolean_t backup, boolean_t holds, boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); sd.fss = fnvlist_alloc(); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.raw = raw; sd.doall = doall; sd.replicate = replicate; sd.skipmissing = skipmissing; sd.verbose = verbose; sd.backup = backup; sd.holds = holds; sd.props = props; if ((error = send_iterate_fs(zhp, &sd)) != 0) { fnvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { fnvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw, holds; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; int verbosity; uint64_t size; } send_dump_data_t; static int zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t *spacep) { libzfs_handle_t *hdl = zhp->zfs_hdl; int error; assert(snapname != NULL); error = lzc_send_space(snapname, from, flags, spacep); if (error != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), snapname); switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, snapname, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), snapname); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, error, errbuf)); } } return (0); } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; thisdbg = fnvlist_alloc(); if (fromsnap && fromsnap[0] != '\0') { fnvlist_add_string(thisdbg, "fromsnap", fromsnap); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); fnvlist_add_uint64(thisdbg, "error", errno); if (debugnv) { fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); } fnvlist_free(thisdbg); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv) fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } int zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, uint64_t *blocks_visited) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = fd; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return (errno); if (bytes_written != NULL) *bytes_written = zc.zc_cookie; if (blocks_visited != NULL) *blocks_visited = zc.zc_objset_type; return (0); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_handle_t *zhp = pa->pa_zhp; uint64_t bytes; uint64_t blocks; char buf[16]; time_t t; struct tm *tm; boolean_t firstloop = B_TRUE; /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { int err; (void) sleep(1); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) return ((void *)0); return ((void *)(uintptr_t)err); } if (firstloop && !pa->pa_parsable) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? "BYTES" : " SENT", pa->pa_verbosity >= 2 ? " BLOCKS " : "", zhp->zfs_name); firstloop = B_FALSE; } (void) time(&t); tm = localtime(&t); if (pa->pa_verbosity >= 2 && pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, (u_longlong_t)bytes, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_verbosity >= 2) { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %8llu %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, (u_longlong_t)bytes, zhp->zfs_name); } else { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, zhp->zfs_name); } } } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, "incremental\t%s\t%s", fromsnap, tosnap); } else { (void) fprintf(fout, "full\t%s", tosnap); } } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } } if (parsable) { (void) fprintf(fout, "\t%llu", (longlong_t)size); } else if (size != 0) { char buf[16]; zfs_nicebytes(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); } (void) fprintf(fout, "\n"); } static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? stdout : stderr; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (sdd->raw) flags |= LZC_SEND_FLAG_RAW; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); - snapprops = fnvlist_lookup_nvlist(nvfs, "snapprops"); - snapprops = fnvlist_lookup_nvlist(snapprops, thissnap); - exclude = !nvlist_exists(snapprops, "is_clone_origin"); + if (nvfs != NULL) { + snapprops = fnvlist_lookup_nvlist(nvfs, + "snapprops"); + snapprops = fnvlist_lookup_nvlist(snapprops, + thissnap); + exclude = !nvlist_exists(snapprops, + "is_clone_origin"); + } } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; if (sdd->prevsnap[0] != '\0') { (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); *(strchr(fromds, '@') + 1) = '\0'; (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); } if (zfs_send_space(zhp, zhp->zfs_name, sdd->prevsnap[0] ? fromds : NULL, flags, &size) != 0) { size = 0; /* cannot estimate send space */ } else { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); } sdd->size += size; } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (sdd->progress) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = sdd->verbosity; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { void *status = NULL; (void) pthread_cancel(tid); (void) pthread_join(tid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited nonzero")); return (zfs_standard_error(zhp->zfs_hdl, error, errbuf)); } } } (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } static int dump_filesystem(zfs_handle_t *zhp, void *arg) { int rv = 0; send_dump_data_t *sdd = arg; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = {"\0"}; uint64_t min_txg = 0, max_txg = 0; (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } if (sdd->replicate && sdd->fromsnap) { /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { missingfrom = B_TRUE; } } sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; /* * Iterate through all snapshots and process the ones we will be * sending. If we only have a "from" and "to" snapshot to deal * with, we can avoid iterating through all the other snapshots. */ if (sdd->doall || sdd->replicate || sdd->tosnap == NULL) { if (!sdd->replicate && sdd->fromsnap != NULL) min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->fromsnap); if (!sdd->replicate && sdd->tosnap != NULL) max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->tosnap); rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; if (!sdd->seenfrom) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->fromsnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = -1; } if (rv == 0) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->tosnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = -1; } } if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } static int dump_filesystems(zfs_handle_t *rzhp, void *arg) { send_dump_data_t *sdd = arg; nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; nvfs = fnvpair_value_nvlist(fspair); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; snapprops = fnvlist_lookup_nvlist(origin_nv, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, snapname); fnvlist_add_boolean(snapprops, "is_clone_origin"); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; fslist = fnvpair_value_nvlist(fspair); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; fsname = fnvlist_lookup_string(fslist, "name"); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* parent has not been sent; skip this one */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); fnvlist_add_boolean(fslist, "sent"); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* clean out the sent flags in case we reuse this fss */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; fslist = fnvpair_value_nvlist(fspair); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread, i; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. */ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* convert hexadecimal representation to binary */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* verify checksum */ zio_cksum_t cksum; fletcher_4_native_varsize(compressed, len, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* uncompress */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* unpack nvlist */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } static enum lzc_send_flags lzc_flags_from_sendflags(const sendflags_t *flags) { enum lzc_send_flags lzc_flags = 0; if (flags->largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, const char *redactbook, char *errbuf) { uint64_t size; FILE *fout = flags->dryrun ? stdout : stderr; progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_TRUE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, redactbook, fd, &size); if (flags->progress) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited " "nonzero")); return (zfs_standard_error(zhp->zfs_hdl, error, errbuf)); } } if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } send_print_verbose(fout, zhp->zfs_name, from, size, flags->parsable); if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); } else { char buf[16]; zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } return (0); } static boolean_t redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } static boolean_t redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, const uint64_t *snaps2, uint64_t num_snaps2) { if (num_snaps1 != num_snaps2) return (B_FALSE); for (int i = 0; i < num_snaps1; i++) { if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) return (B_FALSE); } return (B_TRUE); } /* * Check that the list of redaction snapshots in the bookmark matches the send * we're resuming, and return whether or not it's complete. * * Note that the caller needs to free the contents of *bookname with free() if * this function returns successfully. */ static int find_redact_book(libzfs_handle_t *hdl, const char *path, const uint64_t *redact_snap_guids, int num_redact_snaps, char **bookname) { char errbuf[1024]; int error = 0; nvlist_t *props = fnvlist_alloc(); nvlist_t *bmarks; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); fnvlist_add_boolean(props, "redact_complete"); fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); error = lzc_get_bookmarks(path, props, &bmarks); fnvlist_free(props); if (error != 0) { if (error == ESRCH) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nonexistent redaction bookmark provided")); } else if (error == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset to be sent no longer exists")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unknown error: %s"), strerror(error)); } return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } nvpair_t *pair; for (pair = nvlist_next_nvpair(bmarks, NULL); pair; pair = nvlist_next_nvpair(bmarks, pair)) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); uint_t len = 0; uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, ZPROP_VALUE, &len); if (redact_snaps_equal(redact_snap_guids, num_redact_snaps, bmarksnaps, len)) { break; } } if (pair == NULL) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no appropriate redaction bookmark exists")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } char *name = nvpair_name(pair); nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); boolean_t complete = fnvlist_lookup_boolean_value(vallist, ZPROP_VALUE); if (!complete) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incomplete redaction bookmark provided")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } *bookname = strndup(name, ZFS_MAX_DATASET_NAME_LEN); ASSERT3P(*bookname, !=, NULL); fnvlist_free(bmarks); return (0); } static int zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, nvlist_t *resume_nvl) { char errbuf[1024]; char *toname; char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? stdout : stderr; uint64_t *redact_snap_guids = NULL; int num_redact_snaps = 0; char *redact_book = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); if (flags->verbosity != 0) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress || nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw || nvlist_exists(resume_nvl, "rawok")) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved || nvlist_exists(resume_nvl, "savedok")) lzc_flags |= LZC_SEND_FLAG_SAVED; if (flags->saved) { (void) strcpy(name, toname); } else { error = guid_to_name(hdl, toname, toguid, B_FALSE, name); if (error != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot " "used in the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no " "longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { num_redact_snaps = -1; } if (fromguid != 0) { if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, redact_snap_guids, num_redact_snaps, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } redact_snap_guids = NULL; if (nvlist_lookup_uint64_array(resume_nvl, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, (uint_t *)&num_redact_snaps) == 0) { char path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(path, toname, sizeof (path)); char *at = strchr(path, '@'); ASSERT3P(at, !=, NULL); *at = '\0'; if ((error = find_redact_book(hdl, path, redact_snap_guids, num_redact_snaps, &redact_book)) != 0) { return (error); } } if (flags->verbosity != 0) { /* * Some of these may have come from the resume token, set them * here for size estimate purposes. */ sendflags_t tmpflags = *flags; if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) tmpflags.largeblock = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_COMPRESS) tmpflags.compress = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) tmpflags.embed_data = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_RAW) tmpflags.raw = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_SAVED) tmpflags.saved = B_TRUE; error = estimate_size(zhp, fromname, outfd, &tmpflags, resumeobj, resumeoff, bytes, redact_book, errbuf); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { if (redact_book != NULL) free(redact_book); zfs_close(zhp); return (error); } } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff, redact_book); if (redact_book != NULL) free(redact_book); if (flags->progress) { void *status = NULL; (void) pthread_cancel(tid); (void) pthread_join(tid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "progress thread exited nonzero")); return (zfs_standard_error(hdl, error, errbuf)); } } char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source could not be found")); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } else { if (redact_book != NULL) free(redact_book); } zfs_close(zhp); return (error); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { int ret; char errbuf[1024]; nvlist_t *resume_nvl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); fnvlist_free(resume_nvl); return (ret); } int zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, const char *resume_token) { int ret; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; uint64_t saved_guid = 0, resume_guid = 0; uint64_t obj = 0, off = 0, bytes = 0; char token_buf[ZFS_MAXPROPLEN]; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "saved send failed")); ret = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (ret != 0) goto out; saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); if (saved_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } /* * If a resume token is provided we use the object and offset * from that instead of the default, which starts from the * beginning. */ if (resume_token != NULL) { resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &resume_guid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(saved_nvl, "toguid", &saved_guid)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset's resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (resume_guid != saved_guid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token does not match dataset")); ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); goto out; } } (void) nvlist_remove_all(saved_nvl, "object"); fnvlist_add_uint64(saved_nvl, "object", obj); (void) nvlist_remove_all(saved_nvl, "offset"); fnvlist_add_uint64(saved_nvl, "offset", off); (void) nvlist_remove_all(saved_nvl, "bytes"); fnvlist_add_uint64(saved_nvl, "bytes", bytes); (void) nvlist_remove_all(saved_nvl, "toname"); fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); out: fnvlist_free(saved_nvl); fnvlist_free(resume_nvl); return (ret); } /* * This function informs the target system that the recursive send is complete. * The record is also expected in the case of a send -p. */ static int send_conclusion_record(int fd, zio_cksum_t *zc) { dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; if (write(fd, &drr, sizeof (drr)) == -1) { return (errno); } return (0); } /* * This function is responsible for sending the records that contain the * necessary information for the target system's libzfs to be able to set the * properties of the filesystem being received, or to be able to prepare for * a recursive receive. * * The "zhp" argument is the handle of the snapshot we are sending * (the "tosnap"). The "from" argument is the short snapshot name (the part * after the @) of the incremental source. */ static int send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, boolean_t gather_props, boolean_t recursive, boolean_t verbose, boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t skipmissing, boolean_t backup, boolean_t holds, boolean_t props, boolean_t doall, nvlist_t **fssp, avl_tree_t **fsavlp) { int err = 0; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { {0} }; int featureflags = 0; /* name of filesystem/volume that contains snapshot we are sending */ char tofs[ZFS_MAX_DATASET_NAME_LEN]; /* short name of snap we are sending */ char *tosnap = ""; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } if (holds) featureflags |= DMU_BACKUP_FEATURE_HOLDS; (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); char *at = strchr(tofs, '@'); if (at != NULL) { *at = '\0'; tosnap = at + 1; } if (gather_props) { nvlist_t *hdrnv = fnvlist_alloc(); nvlist_t *fss = NULL; if (from != NULL) fnvlist_add_string(hdrnv, "fromsnap", from); fnvlist_add_string(hdrnv, "tosnap", tosnap); if (!recursive) fnvlist_add_boolean(hdrnv, "not_recursive"); if (raw) { fnvlist_add_boolean(hdrnv, "raw"); } if ((err = gather_nvlist(zhp->zfs_hdl, tofs, from, tosnap, recursive, raw, doall, replicate, skipmissing, verbose, backup, holds, props, &fss, fsavlp)) != 0) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } /* * Do not allow the size of the properties list to exceed * the limit */ if ((fnvlist_size(fss) + fnvlist_size(hdrnv)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s': " "the size of the list of snapshots and properties " "is too large to be received successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name); return (zfs_error(zhp->zfs_hdl, EZFS_NOSPC, errbuf)); } fnvlist_add_nvlist(hdrnv, "fss", fss); VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0)); if (fssp != NULL) { *fssp = fss; } else { fnvlist_free(fss); } fnvlist_free(hdrnv); } if (!dryrun) { dmu_replay_record_t drr = { 0 }; /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); if (snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, fd); free(packbuf); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } err = send_conclusion_record(fd, &zc); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } } return (0); } /* * Generate a send stream. The "zhp" argument is the filesystem/volume * that contains the snapshot to send. The "fromsnap" argument is the * short name (the part after the '@') of the snapshot that is the * incremental source to send from (if non-NULL). The "tosnap" argument * is the short name of the snapshot to send. * * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } + if (fromsnap) { + char full_fromsnap_name[ZFS_MAX_DATASET_NAME_LEN]; + if (snprintf(full_fromsnap_name, sizeof (full_fromsnap_name), + "%s@%s", zhp->zfs_name, fromsnap) >= + sizeof (full_fromsnap_name)) { + err = EINVAL; + goto stderr_out; + } + zfs_handle_t *fromsnapn = zfs_open(zhp->zfs_hdl, + full_fromsnap_name, ZFS_TYPE_SNAPSHOT); + if (fromsnapn == NULL) { + err = -1; + goto err_out; + } + zfs_close(fromsnapn); + } + if (flags->replicate || flags->doall || flags->props || flags->holds || flags->backup) { char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), "%s@%s", zhp->zfs_name, tosnap) >= sizeof (full_tosnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, full_tosnap_name, ZFS_TYPE_SNAPSHOT); if (tosnap == NULL) { err = -1; goto err_out; } err = send_prelim_records(tosnap, fromsnap, outfd, flags->replicate || flags->props || flags->holds, flags->replicate, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, flags->skipmissing, flags->backup, flags->holds, flags->props, flags->doall, &fss, &fsavl); zfs_close(tosnap); if (err != 0) goto err_out; } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.raw = flags->raw; sdd.holds = flags->holds; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. */ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicebytes(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. */ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); fnvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props || flags->backup || flags->holds)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. */ err = send_conclusion_record(outfd, NULL); if (err != 0) return (zfs_standard_error(zhp->zfs_hdl, err, errbuf)); } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); fnvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); return (err); } static zfs_handle_t * name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) { char dirname[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(dirname, '@'); if (c != NULL) *c = '\0'; return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); } /* * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either * an earlier snapshot in the same filesystem, or a snapshot before later's * origin, or it's origin's origin, etc. */ static boolean_t snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) { boolean_t ret; uint64_t later_txg = (later->zfs_type == ZFS_TYPE_FILESYSTEM || later->zfs_type == ZFS_TYPE_VOLUME ? UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); if (earlier_txg >= later_txg) return (B_FALSE); zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, earlier->zfs_name); zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, later->zfs_name); if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_TRUE); } char clonename[ZFS_MAX_DATASET_NAME_LEN]; if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_FALSE); } zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, ZFS_TYPE_DATASET); uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); /* * If "earlier" is exactly the origin, then * snapshot_is_before(earlier, origin) will return false (because * they're the same). */ if (origin_txg == earlier_txg && strcmp(origin->zfs_name, earlier->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); zfs_close(origin); return (B_TRUE); } zfs_close(earlier_dir); zfs_close(later_dir); ret = snapshot_is_before(earlier, origin); zfs_close(origin); return (ret); } /* * The "zhp" argument is the handle of the dataset to send (typically a * snapshot). The "from" argument is the full name of the snapshot or * bookmark that is the incremental source. */ int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { int err; libzfs_handle_t *hdl = zhp->zfs_hdl; char *name = zhp->zfs_name; pthread_t ptid; progress_arg_t pa = { 0 }; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), name); if (from != NULL && strchr(from, '@')) { zfs_handle_t *from_zhp = zfs_open(hdl, from, ZFS_TYPE_DATASET); if (from_zhp == NULL) return (-1); if (!snapshot_is_before(from_zhp, zhp)) { zfs_close(from_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } zfs_close(from_zhp); } if (redactbook != NULL) { char bookname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *redact_snaps; zfs_handle_t *book_zhp; char *at, *pound; int dsnamelen; pound = strchr(redactbook, '#'); if (pound != NULL) redactbook = pound + 1; at = strchr(name, '@'); if (at == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot do a redacted send to a filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } dsnamelen = at - name; if (snprintf(bookname, sizeof (bookname), "%.*s#%s", dsnamelen, name, redactbook) >= sizeof (bookname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid bookmark name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); if (book_zhp == NULL) return (-1); if (nvlist_lookup_nvlist(book_zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snaps) != 0 || redact_snaps == NULL) { zfs_close(book_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a redaction bookmark")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } zfs_close(book_zhp); } /* * Send fs properties */ if (flags->props || flags->holds || flags->backup) { /* * Note: the header generated by send_prelim_records() * assumes that the incremental source is in the same * filesystem/volume as the target (which is a requirement * when doing "zfs send -R"). But that isn't always the * case here (e.g. send from snap in origin, or send from * bookmark). We pass from=NULL, which will omit this * information from the prelim records; it isn't used * when receiving this type of stream. */ err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, B_FALSE, flags->backup, flags->holds, flags->props, flags->doall, NULL, NULL); if (err != 0) return (err); } /* * Perform size estimate if verbose was specified. */ if (flags->verbosity != 0) { err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, errbuf); if (err != 0) return (err); } if (flags->dryrun) return (0); /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); if (flags->progress) { void *status = NULL; if (err != 0) (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error_fmt(hdl, error, dgettext(TEXT_DOMAIN, "progress thread exited nonzero"))); } if (flags->props || flags->holds || flags->backup) { /* Write the final end record. */ err = send_conclusion_record(fd, NULL); if (err != 0) return (zfs_standard_error(hdl, err, errbuf)); } if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFAULT: case EFBIG: case EINVAL: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EROFS: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) fletcher_4_incremental_byteswap(buf, ilen, zc); else fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (buf == NULL) return (ENOMEM); if (len > hdl->libzfs_max_nvlist) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nvlist too large")); free(buf); return (ENOMEM); } err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } /* * Returns the grand origin (origin of origin of origin...) of a given handle. * If this dataset is not a clone, it simply returns a copy of the original * handle. */ static zfs_handle_t * recv_open_grand_origin(zfs_handle_t *zhp) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *ozhp = zfs_handle_dup(zhp); while (ozhp != NULL) { if (zfs_prop_get(ozhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) break; (void) zfs_close(ozhp); ozhp = zfs_open(zhp->zfs_hdl, origin, ZFS_TYPE_FILESYSTEM); } return (ozhp); } static int recv_rename_impl(zfs_handle_t *zhp, const char *name, const char *newname) { int err; zfs_handle_t *ozhp = NULL; /* * Attempt to rename the dataset. If it fails with EACCES we have * attempted to rename the dataset outside of its encryption root. * Force the dataset to become an encryption root and try again. */ err = lzc_rename(name, newname); if (err == EACCES) { ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = ENOENT; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = lzc_rename(name, newname); } out: if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp = NULL; zfs_handle_t *zhp = NULL; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (clp == NULL) { err = -1; goto out; } err = changelist_prefix(clp); if (err) goto out; if (tryname) { (void) strcpy(newname, tryname); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); out: if (clp != NULL) changelist_free(clp); if (zhp != NULL) zfs_close(zhp); return (err); } static int recv_promote(libzfs_handle_t *hdl, const char *fsname, const char *origin_fsname, recvflags_t *flags) { int err; zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL, *ozhp = NULL; if (flags->verbose) (void) printf("promoting %s\n", fsname); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); /* * Attempt to promote the dataset. If it fails with EACCES the * promotion would cause this dataset to leave its encryption root. * Force the origin to become an encryption root and try again. */ err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (err == EACCES) { zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = -1; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); } out: if (zhp != NULL) zfs_close(zhp); if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. */ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; uint64_t *redact_snap_guids; uint64_t num_redact_snaps; } guid_to_name_data_t; static boolean_t redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) { uint64_t *bmark_snaps; uint_t bmark_num_snaps; nvlist_t *nvl; if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) return (B_FALSE); nvl = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, &bmark_num_snaps); if (bmark_num_snaps != gtnd->num_redact_snaps) return (B_FALSE); int i = 0; for (; i < bmark_num_snaps; i++) { int j = 0; for (; j < bmark_num_snaps; j++) { if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) break; } if (j == bmark_num_snaps) break; } return (i == bmark_num_snaps); } static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. * * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or * -1, then redact_snap_guids will be an array of the guids of the snapshots the * redaction bookmark was created with. If num_redact_snaps is -1, then we will * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the * given guid. Note that a redaction bookmark can be returned if * num_redact_snaps == -1. */ static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; gtnd.redact_snap_guids = redact_snap_guids; gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); if (err != EEXIST) err = zfs_iter_children(zhp, guid_to_name_cb, >nd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, -1, name)); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. */ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname = NULL, *snapname = NULL; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } /* * This function reestablishes the hierarchy of encryption roots after a * recursive incremental receive has completed. This must be done after the * second call to recv_incremental_replication() has renamed and promoted all * sent datasets to their final locations in the dataset hierarchy. */ static int recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, nvlist_t *stream_nv, avl_tree_t *stream_avl) { int err; nvpair_t *fselem = NULL; nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { zfs_handle_t *zhp = NULL; uint64_t crypt; nvlist_t *snaps, *props, *stream_nvfs = NULL; nvpair_t *snapel = NULL; boolean_t is_encroot, is_clone, stream_encroot; char *cp; char *stream_keylocation = NULL; char keylocation[MAXNAMELEN]; char fsname[ZFS_MAX_DATASET_NAME_LEN]; keylocation[0] = '\0'; stream_nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(stream_nvfs, "snaps"); props = fnvlist_lookup_nvlist(stream_nvfs, "props"); stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); /* find a snapshot from the stream that exists locally */ err = ENOENT; while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { uint64_t guid; guid = fnvpair_value_uint64(snapel); err = guid_to_name(hdl, top_zfs, guid, B_FALSE, fsname); if (err == 0) break; } if (err != 0) continue; cp = strchr(fsname, '@'); if (cp != NULL) *cp = '\0'; zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = ENOENT; goto error; } crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); /* we don't need to do anything for unencrypted datasets */ if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; } /* * If the dataset is flagged as an encryption root, was not * received as a clone and is not currently an encryption root, * force it to become one. Fixup the keylocation if necessary. */ if (stream_encroot) { if (!is_clone && !is_encroot) { err = lzc_change_key(fsname, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } stream_keylocation = fnvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); /* * Refresh the properties in case the call to * lzc_change_key() changed the value. */ zfs_refresh_properties(zhp); err = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, keylocation, sizeof (keylocation), NULL, NULL, 0, B_TRUE); if (err != 0) { zfs_close(zhp); goto error; } if (strcmp(keylocation, stream_keylocation) != 0) { err = zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), stream_keylocation); if (err != 0) { zfs_close(zhp); goto error; } } } /* * If the dataset is not flagged as an encryption root and is * currently an encryption root, force it to inherit from its * parent. The root of a raw send should never be * force-inherited. */ if (!stream_encroot && is_encroot && strcmp(top_zfs, fsname) != 0) { err = lzc_change_key(fsname, DCP_CMD_FORCE_INHERIT, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } zfs_close(zhp); } return (0); error: return (err); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; fromsnap = fnvlist_lookup_string(stream_nv, "fromsnap"); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; deleted = fnvlist_alloc(); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); fsname = fnvlist_lookup_string(nvfs, "name"); parent_fromsnap_guid = fnvlist_lookup_uint64(nvfs, "parentfromsnap"); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; thisguid = fnvpair_value_uint64(snapelem); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! */ nvlist_t *origin_nvfs; char *origin_fsname; origin_nvfs = fsavl_find(local_avl, originguid, NULL); origin_fsname = fnvlist_lookup_string( origin_nvfs, "name"); error = recv_promote(hdl, fsname, origin_fsname, flags); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); fnvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. */ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); thisguid = fnvpair_value_uint64(snapelem); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = {"\0"}; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, props) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } stream_fsname = fnvlist_lookup_string(stream_nvfs, "name"); stream_parent_fromsnap_guid = fnvlist_lookup_uint64( stream_nvfs, "parentfromsnap"); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. */ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. */ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { char *pname; pname = fnvlist_lookup_string(parent, "name"); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { fnvlist_add_boolean(renamed, newname); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); fnvlist_free(local_nv); fnvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain || error != 0); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, nvlist_t *cmdprops) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[1024]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive, raw; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); raw = (nvlist_lookup_boolean(stream_nv, "raw") == 0); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. */ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { renamed = fnvlist_alloc(); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } fnvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. */ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } if (raw && softerr == 0 && *top_zfs != NULL) { softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, stream_nv, stream_avl); } out: fsavl_destroy(stream_avl); fnvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); uint64_t payload_size; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); drr->drr_u.drr_object.drr_raw_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_raw_bonuslen); } payload_size = DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); assert(payload_size <= SPA_MAXBLOCKSIZE); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); drr->drr_u.drr_spill.drr_compressed_size = BSWAP_64(drr->drr_u.drr_spill. drr_compressed_size); } payload_size = DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); free(buf); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable, boolean_t checksum) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? "checksum mismatch" : "incomplete stream"))); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Prepare a new nvlist of properties that are to override (-o) or be excluded * (-x) from the received dataset * recvprops: received properties from the send stream * cmdprops: raw input properties from command line * origprops: properties, both locally-set and received, currently set on the * target dataset if it exists, NULL otherwise. * oxprops: valid output override (-o) and excluded (-x) properties */ static int zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, char *fsname, boolean_t zoned, boolean_t recursive, boolean_t newfs, boolean_t raw, boolean_t toplevel, nvlist_t *recvprops, nvlist_t *cmdprops, nvlist_t *origprops, nvlist_t **oxprops, uint8_t **wkeydata_out, uint_t *wkeylen_out, const char *errbuf) { nvpair_t *nvp; nvlist_t *oprops, *voprops; zfs_handle_t *zhp = NULL; zpool_handle_t *zpool_hdl = NULL; char *cp; int ret = 0; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; if (nvlist_empty(cmdprops)) return (0); /* No properties to override or exclude */ *oxprops = fnvlist_alloc(); oprops = fnvlist_alloc(); strlcpy(namebuf, fsname, ZFS_MAX_DATASET_NAME_LEN); /* * Get our dataset handle. The target dataset may not exist yet. */ if (zfs_dataset_exists(hdl, namebuf, ZFS_TYPE_DATASET)) { zhp = zfs_open(hdl, namebuf, ZFS_TYPE_DATASET); if (zhp == NULL) { ret = -1; goto error; } } /* open the zpool handle */ cp = strchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; zpool_hdl = zpool_open(hdl, namebuf); if (zpool_hdl == NULL) { ret = -1; goto error; } /* restore namebuf to match fsname for later use */ if (cp != NULL) *cp = '/'; /* * first iteration: process excluded (-x) properties now and gather * added (-o) properties to be later processed by zfs_valid_proplist() */ nvp = NULL; while ((nvp = nvlist_next_nvpair(cmdprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); /* "origin" is processed separately, don't handle it here */ if (prop == ZFS_PROP_ORIGIN) continue; /* raw streams can't override encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set or excluded for raw streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* incremental streams can only exclude encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && !newfs && nvpair_type(nvp) != DATA_TYPE_BOOLEAN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set for incremental streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } switch (nvpair_type(nvp)) { case DATA_TYPE_BOOLEAN: /* -x property */ /* * DATA_TYPE_BOOLEAN is the way we're asked to "exclude" * a property: this is done by forcing an explicit * inherit on the destination so the effective value is * not the one we received from the send stream. */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: %s: property '%s' does not " "apply to datasets of this type\n"), fsname, name); continue; } /* * We do this only if the property is not already * locally-set, in which case its value will take * priority over the received anyway. */ if (nvlist_exists(origprops, name)) { nvlist_t *attrs; char *source = NULL; attrs = fnvlist_lookup_nvlist(origprops, name); if (nvlist_lookup_string(attrs, ZPROP_SOURCE, &source) == 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } /* * We can't force an explicit inherit on non-inheritable * properties: if we're asked to exclude this kind of * values we remove them from "recvprops" input nvlist. */ if (!zfs_prop_inheritable(prop) && !zfs_prop_user(name) && /* can be inherited too */ nvlist_exists(recvprops, name)) fnvlist_remove(recvprops, name); else fnvlist_add_nvpair(*oxprops, nvp); break; case DATA_TYPE_STRING: /* -o property=value */ /* * we're trying to override a property that does not * make sense for this type of dataset, but we don't * want to fail if the receive is recursive: this comes * in handy when the send stream contains, for * instance, a child ZVOL and we're trying to receive * it with "-o atime=on" */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { if (recursive) continue; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' does not apply to datasets " "of this type"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } fnvlist_add_nvpair(oprops, nvp); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be a string or boolean"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (toplevel) { /* convert override strings properties to native */ if ((voprops = zfs_valid_proplist(hdl, ZFS_TYPE_DATASET, oprops, zoned, zhp, zpool_hdl, B_FALSE, errbuf)) == NULL) { ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * zfs_crypto_create() requires the parent name. Get it * by truncating the fsname copy stored in namebuf. */ cp = strrchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; if (!raw && zfs_crypto_create(hdl, namebuf, voprops, NULL, B_FALSE, wkeydata_out, wkeylen_out) != 0) { fnvlist_free(voprops); ret = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto error; } /* second pass: process "-o" properties */ fnvlist_merge(*oxprops, voprops); fnvlist_free(voprops); } else { /* override props on child dataset are inherited */ nvp = NULL; while ((nvp = nvlist_next_nvpair(oprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); fnvlist_add_boolean(*oxprops, name); } } error: if (zhp != NULL) zfs_close(zhp); if (zpool_hdl != NULL) zpool_close(zpool_hdl); fnvlist_free(oprops); return (ret); } /* * Restores a backup of tosnap from the file descriptor specified by infd. */ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { time_t begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs, stream_resumingnewfs; boolean_t newprops = B_FALSE; uint64_t read_bytes = 0; uint64_t errflags = 0; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; nvlist_t *snapholds_nvlist = NULL; zprop_errflags_t prop_errflags; nvlist_t *prop_errors = NULL; boolean_t recursive; char *snapname = NULL; char destsnap[MAXPATHLEN * 2]; char origin[MAXNAMELEN]; char name[MAXPATHLEN]; char tmp_keylocation[MAXNAMELEN]; nvlist_t *rcvprops = NULL; /* props received from the send stream */ nvlist_t *oxprops = NULL; /* override (-o) and exclude (-x) props */ nvlist_t *origprops = NULL; /* original props (if destination exists) */ zfs_type_t type; boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; begin_time = time(NULL); bzero(origin, MAXNAMELEN); bzero(tmp_keylocation, MAXNAMELEN); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); /* Did the user request holds be skipped via zfs recv -k? */ boolean_t holds = flags->holds && !flags->skipholds; if (stream_avl != NULL) { char *keylocation = NULL; nvlist_t *lookup = NULL; nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &rcvprops); if (err) { rcvprops = fnvlist_alloc(); newprops = B_TRUE; } /* * The keylocation property may only be set on encryption roots, * but this dataset might not become an encryption root until * recv_fix_encryption_hierarchy() is called. That function * will fixup the keylocation anyway, so we temporarily unset * the keylocation for now to avoid any errors from the receive * ioctl. */ err = nvlist_lookup_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (err == 0) { strcpy(tmp_keylocation, keylocation); (void) nvlist_remove_all(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); } if (flags->canmountoff) { fnvlist_add_uint64(rcvprops, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0); } else if (newprops) { /* nothing in rcvprops, eliminate it */ fnvlist_free(rcvprops); rcvprops = NULL; newprops = B_FALSE; } if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { snapprops_nvlist = fnvlist_lookup_nvlist(lookup, snapname); } if (holds) { if (0 == nvlist_lookup_nvlist(fs, "snapholds", &lookup)) { snapholds_nvlist = fnvlist_lookup_nvlist( lookup, snapname); } } } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = malloc(len + 2); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). */ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname || strchr(sendfs, '/') == NULL); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname) || strchr(sendfs, '/') == NULL); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot. */ (void) strlcpy(destsnap, tosnap, sizeof (destsnap)); (void) strlcat(destsnap, chopprefix, sizeof (destsnap)); free(cp); if (!zfs_name_valid(destsnap, ZFS_TYPE_SNAPSHOT)) { err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } /* * Determine the name of the origin snapshot. */ if (originsnap) { (void) strlcpy(origin, originsnap, sizeof (origin)); if (flags->verbose) (void) printf("using provided clone origin %s\n", origin); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, destsnap, drrb->drr_fromguid, B_FALSE, origin) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), destsnap); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } if (flags->verbose) (void) printf("found clone origin %s\n", origin); } if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_DEDUP)) { (void) fprintf(stderr, gettext("ERROR: \"zfs receive\" no longer supports " "deduplicated send streams. Use\n" "the \"zstream redup\" command to convert this stream " "to a regular,\n" "non-deduplicated stream.\n")); err = zfs_error(hdl, EZFS_NOTSUP, errbuf); goto out; } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW; boolean_t embedded = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; stream_resumingnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strcpy(name, destsnap); cp = strrchr(name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(suffix, strrchr(destsnap, '/')); if (guid_to_name(hdl, name, parent_snapguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strcat(destsnap, suffix); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. */ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(snap, strchr(destsnap, '@')); if (guid_to_name(hdl, name, drrb->drr_fromguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strcat(destsnap, snap); } } } (void) strcpy(name, destsnap); *strchr(name, '@') = '\0'; redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp; boolean_t encrypted; (void) strcpy(zc.zc_name, name); /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; if (!flags->force) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && strrchr(name, '/') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s is the root dataset\n" "cannot overwrite with a ZVOL"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has children (eg. %s)\n" "cannot overwrite with a ZVOL"), zc.zc_name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } if ((zhp = zfs_open(hdl, name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { err = -1; goto out; } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } /* * Raw sends can not be performed as an incremental on top * of existing unencrypted datasets. zfs recv -F can't be * used to blow away an existing encrypted filesystem. This * is because it would require the dsl dir to point to the * new key (or lack of a key) and the old key at the same * time. The -F flag may still be used for deleting * intermediate snapshots that would otherwise prevent the * receive from working. */ encrypted = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF; if (!stream_wantsnewfs && !encrypted && raw) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot perform raw receive on top of " "existing unencrypted dataset")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (stream_wantsnewfs && flags->force && ((raw && !encrypted) || encrypted)) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "zfs receive -F cannot be used to destroy an " "encrypted filesystem or overwrite an " "unencrypted one with an encrypted one")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && (stream_wantsnewfs || stream_resumingnewfs)) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->forceunmount ? MS_FORCE : 0); if (clp == NULL) { zfs_close(zhp); err = -1; goto out; } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); err = -1; goto out; } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; /* we want to know if we're zoned when validating -o|-x props */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); /* may need this info later, get it now we have zhp around */ if (zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) hastoken = B_TRUE; /* gather existing properties on destination */ origprops = fnvlist_alloc(); fnvlist_merge(origprops, zhp->zfs_props); fnvlist_merge(origprops, zhp->zfs_user_props); zfs_close(zhp); } else { zfs_handle_t *zhp; /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ cp = strrchr(name, '/'); if (!stream_wantsnewfs || cp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), name); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. */ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, destsnap, strlen(tosnap)) != 0) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } /* validate parent */ zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent '%s' is not a filesystem"), name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); zfs_close(zhp); goto out; } zfs_close(zhp); newfs = B_TRUE; *cp = '/'; } if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); } /* * If this is the top-level dataset, record it so we can use it * for recursive operations later. */ if (top_zfs != NULL && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { toplevel = B_TRUE; if (*top_zfs == NULL) *top_zfs = zfs_strdup(hdl, name); } if (drrb->drr_type == DMU_OST_ZVOL) { type = ZFS_TYPE_VOLUME; } else if (drrb->drr_type == DMU_OST_ZFS) { type = ZFS_TYPE_FILESYSTEM; } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type: 0x%d"), drrb->drr_type); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if ((err = zfs_setup_cmdline_props(hdl, type, name, zoned, recursive, stream_wantsnewfs, raw, toplevel, rcvprops, cmdprops, origprops, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; /* * When sending with properties (zfs send -p), the encryption property * is not included because it is a SETONCE property and therefore * treated as read only. However, we are always able to determine its * value because raw sends will include it in the DRR_BDEGIN payload * and non-raw sends with properties are not allowed for encrypted * datasets. Therefore, if this is a non-raw properties stream, we can * infer that the value should be ZIO_CRYPT_OFF and manually add that * to the received properties. */ if (stream_wantsnewfs && !raw && rcvprops != NULL && !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { if (oxprops == NULL) oxprops = fnvlist_alloc(); fnvlist_add_uint64(oxprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } if (flags->dryrun) { void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); /* * We have read the DRR_BEGIN record, but we have * not yet read the payload. For non-dryrun sends * this will be done by the kernel, so we must * emulate that here, before attempting to read * more records. */ err = recv_read(hdl, infd, buf, drr->drr_payloadlen, flags->byteswap, NULL); free(buf); if (err != 0) goto out; err = recv_skip(hdl, infd, flags->byteswap); goto out; } err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); ioctl_errno = ioctl_err; prop_errflags = errflags; if (err == 0) { nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), name); zfs_setprop_error(hdl, prop, intval, tbuf); } } } if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc = {"\0"}; (void) strcpy(zc.zc_name, destsnap); zc.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } if (err == 0 && snapholds_nvlist) { nvpair_t *pair; nvlist_t *holds, *errors = NULL; int cleanup_fd = -1; VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { fnvlist_add_string(holds, destsnap, nvpair_name(pair)); } (void) lzc_hold(holds, cleanup_fd, &errors); fnvlist_free(snapholds_nvlist); fnvlist_free(holds); } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(destsnap, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. */ *cp = '\0'; if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); fnvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", destsnap); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(destsnap, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), destsnap); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption key does not match " "existing key")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "inherited key must be loaded")); } (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); break; case EEXIST: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), destsnap); *cp = '@'; break; case EINVAL: if (flags->resumable) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); } else if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: case ZFS_ERR_STREAM_TRUNCATED: recv_ecksum_set_aux(hdl, destsnap, flags->resumable, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental send stream requires -L " "(--large-block), to match previous receive.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded."), name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid missing. See errata %u at " "https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-ER."), ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid mismatch. See the 'zfs receive' " "man page section\n discussing the limitations " "of raw encrypted send streams.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Spill block flag missing for raw send.\n" "The zfs software on the sending system must " "be updated.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case EBUSY: if (hastoken) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s contains " "partially-complete state from " "\"zfs receive -s\"."), name); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; } fallthrough; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted) flags->domount = B_TRUE; if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) { err = -1; goto out; } if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = read_bytes; time_t delta = time(NULL) - begin_time; if (delta == 0) delta = 1; zfs_nicebytes(bytes, buf1, sizeof (buf1)); zfs_nicebytes(bytes/delta, buf2, sizeof (buf1)); (void) printf("received %s stream in %lld seconds (%s/sec)\n", buf1, (longlong_t)delta, buf2); } err = 0; out: if (prop_errors != NULL) fnvlist_free(prop_errors); if (tmp_keylocation[0] != '\0') { fnvlist_add_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation); } if (newprops) fnvlist_free(rcvprops); fnvlist_free(oxprops); fnvlist_free(origprops); return (err); } /* * Check properties we were asked to override (both -o|-x) */ static boolean_t zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props, const char *errbuf) { nvpair_t *nvp; zfs_prop_t prop; const char *name; nvp = NULL; while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { name = nvpair_name(nvp); prop = zfs_name_to_prop(name); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), name); return (B_FALSE); } continue; } /* * "origin" is readonly but is used to receive datasets as * clones so we don't raise an error here */ if (prop == ZFS_PROP_ORIGIN) continue; /* encryption params have their own verification later */ if (prop == ZFS_PROP_ENCRYPTION || zfs_prop_encryption_key_param(prop)) continue; /* * cannot override readonly, set-once and other specific * settable properties */ if (zfs_prop_readonly(prop) || prop == ZFS_PROP_VERSION || prop == ZFS_PROP_VOLSIZE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), name); return (B_FALSE); } } return (B_TRUE); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { { 0 } }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* check cmdline props, raise an error if they cannot be received */ if (!zfs_receive_checkprops(hdl, cmdprops, errbuf)) { return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. */ bzero(&zcksum, sizeof (zio_cksum_t)); fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = %llx"), (unsigned long long)featureflags); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } /* Holds feature is set once in the compound stream header. */ if (featureflags & DMU_BACKUP_FEATURE_HOLDS) flags->holds = B_TRUE; if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, finalsnap, cmdprops)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cmdprops)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). */ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; struct stat sb; char *originsnap = NULL; /* * The only way fstat can fail is if we do not have a valid file * descriptor. */ if (fstat(infd, &sb) == -1) { perror("fstat"); return (-2); } /* * It is not uncommon for gigabytes to be processed in zfs receive. * Speculatively increase the buffer size if supported by the platform. */ if (S_ISFIFO(sb.st_mode)) libzfs_set_pipe_max(infd); if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, NULL, props); if (err == 0 && !flags->nomount && flags->domount && top_zfs) { zfs_handle_t *zhp = NULL; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { err = -1; goto out; } else { if (zhp->zfs_type == ZFS_TYPE_VOLUME) { zfs_close(zhp); goto out; } clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) { err = -1; goto out; } /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); if (err != 0) err = -1; } } out: if (top_zfs) free(top_zfs); return (err); } diff --git a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_compat.c b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_compat.c index 4d7421df8d3b..f143f9cb63b3 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_compat.c +++ b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_compat.c @@ -1,355 +1,360 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2013 Martin Matuska . All rights reserved. */ -#include #include "../../libzfs_impl.h" #include #include #include #include #include #include #include #include #ifdef IN_BASE #define ZFS_KMOD "zfs" #else #define ZFS_KMOD "openzfs" #endif void libzfs_set_pipe_max(int infd) { /* FreeBSD automatically resizes */ } static int execvPe(const char *name, const char *path, char * const *argv, char * const *envp) { const char **memp; size_t cnt, lp, ln; int eacces, save_errno; char buf[MAXPATHLEN]; const char *bp, *np, *op, *p; struct stat sb; eacces = 0; /* If it's an absolute or relative path name, it's easy. */ if (strchr(name, '/')) { bp = name; op = NULL; goto retry; } bp = buf; /* If it's an empty path name, fail in the usual POSIX way. */ if (*name == '\0') { errno = ENOENT; return (-1); } op = path; ln = strlen(name); while (op != NULL) { np = strchrnul(op, ':'); /* * It's a SHELL path -- double, leading and trailing colons * mean the current directory. */ if (np == op) { /* Empty component. */ p = "."; lp = 1; } else { /* Non-empty component. */ p = op; lp = np - op; } /* Advance to the next component or terminate after this. */ if (*np == '\0') op = NULL; else op = np + 1; /* * If the path is too long complain. This is a possible * security issue; given a way to make the path too long * the user may execute the wrong program. */ if (lp + ln + 2 > sizeof (buf)) { (void) write(STDERR_FILENO, "execvP: ", 8); (void) write(STDERR_FILENO, p, lp); (void) write(STDERR_FILENO, ": path too long\n", 16); continue; } bcopy(p, buf, lp); buf[lp] = '/'; bcopy(name, buf + lp + 1, ln); buf[lp + ln + 1] = '\0'; retry: (void) execve(bp, argv, envp); switch (errno) { case E2BIG: goto done; case ELOOP: case ENAMETOOLONG: case ENOENT: break; case ENOEXEC: for (cnt = 0; argv[cnt]; ++cnt) ; /* * cnt may be 0 above; always allocate at least * 3 entries so that we can at least fit "sh", bp, and * the NULL terminator. We can rely on cnt to take into * account the NULL terminator in all other scenarios, * as we drop argv[0]. */ memp = alloca(MAX(3, cnt + 2) * sizeof (char *)); if (memp == NULL) { /* errno = ENOMEM; XXX override ENOEXEC? */ goto done; } if (cnt > 0) { memp[0] = argv[0]; memp[1] = bp; bcopy(argv + 1, memp + 2, cnt * sizeof (char *)); } else { memp[0] = "sh"; memp[1] = bp; memp[2] = NULL; } (void) execve(_PATH_BSHELL, __DECONST(char **, memp), envp); goto done; case ENOMEM: goto done; case ENOTDIR: break; case ETXTBSY: /* * We used to retry here, but sh(1) doesn't. */ goto done; default: /* * EACCES may be for an inaccessible directory or * a non-executable file. Call stat() to decide * which. This also handles ambiguities for EFAULT * and EIO, and undocumented errors like ESTALE. * We hope that the race for a stat() is unimportant. */ save_errno = errno; if (stat(bp, &sb) != 0) break; if (save_errno == EACCES) { eacces = 1; continue; } errno = save_errno; goto done; } } if (eacces) errno = EACCES; else errno = ENOENT; done: return (-1); } int execvpe(const char *name, char * const argv[], char * const envp[]) { const char *path; /* Get the path we're searching. */ if ((path = getenv("PATH")) == NULL) path = _PATH_DEFPATH; return (execvPe(name, path, argv, envp)); } #define ERRBUFLEN 256 static __thread char errbuf[ERRBUFLEN]; const char * libzfs_error_init(int error) { char *msg = errbuf; size_t len, msglen = ERRBUFLEN; if (modfind("zfs") < 0) { len = snprintf(msg, msglen, dgettext(TEXT_DOMAIN, "Failed to load %s module: "), ZFS_KMOD); msg += len; msglen -= len; } (void) snprintf(msg, msglen, "%s", strerror(error)); return (errbuf); } int zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) { - return (zfs_ioctl_fd(hdl->libzfs_fd, request, zc)); + return (lzc_ioctl_fd(hdl->libzfs_fd, request, zc)); } /* * Verify the required ZFS_DEV device is available and optionally attempt * to load the ZFS modules. Under normal circumstances the modules * should already have been loaded by some external mechanism. */ int libzfs_load_module(void) { /* * XXX: kldfind(ZFS_KMOD) would be nice here, but we retain * modfind("zfs") so out-of-base openzfs userland works with the * in-base module. */ if (modfind("zfs") < 0) { /* Not present in kernel, try loading it. */ if (kldload(ZFS_KMOD) < 0 && errno != EEXIST) { return (errno); } } return (0); } int zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) { return (0); } int zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) { return (0); } int find_shares_object(differ_info_t *di) { return (0); } +int +zfs_destroy_snaps_nvl_os(libzfs_handle_t *hdl, nvlist_t *snaps) +{ + return (0); +} + /* * Attach/detach the given filesystem to/from the given jail. */ int zfs_jail(zfs_handle_t *zhp, int jailid, int attach) { libzfs_handle_t *hdl = zhp->zfs_hdl; zfs_cmd_t zc = {"\0"}; char errbuf[1024]; unsigned long cmd; int ret; if (attach) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name); } switch (zhp->zfs_type) { case ZFS_TYPE_VOLUME: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volumes can not be jailed")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); case ZFS_TYPE_SNAPSHOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots can not be jailed")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); case ZFS_TYPE_BOOKMARK: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bookmarks can not be jailed")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); case ZFS_TYPE_POOL: case ZFS_TYPE_FILESYSTEM: /* OK */ ; } assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = DMU_OST_ZFS; zc.zc_zoneid = jailid; cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL; if ((ret = zfs_ioctl(hdl, cmd, &zc)) != 0) zfs_standard_error(hdl, errno, errbuf); return (ret); } /* * Set loader options for next boot. */ int zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid, const char *command) { zfs_cmd_t zc = {"\0"}; nvlist_t *args; int error; args = fnvlist_alloc(); fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid); fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid); fnvlist_add_string(args, "command", command); error = zcmd_write_src_nvlist(hdl, &zc, args); if (error == 0) error = zfs_ioctl(hdl, ZFS_IOC_NEXTBOOT, &zc); zcmd_free_nvlists(&zc); nvlist_free(args); return (error); } /* * Fill given version buffer with zfs kernel version. * Returns 0 on success, and -1 on error (with errno set) */ int zfs_version_kernel(char *version, int len) { size_t l = len; return (sysctlbyname("vfs.zfs.version.module", version, &l, NULL, 0)); } diff --git a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c deleted file mode 100644 index 18b93fe27969..000000000000 --- a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c +++ /dev/null @@ -1,432 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Xin Li . All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Portions Copyright 2005, 2010, Oracle and/or its affiliates. - * All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_namecheck.h" -#include - -/* - * FreeBSD zfs_cmd compatibility with older binaries - * appropriately remap/extend the zfs_cmd_t structure - */ -void -zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) -{ - -} -#if 0 -static int -zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, - nvlist_t **nvp) -{ - char *packed; - int error; - nvlist_t *list = NULL; - - /* - * Read in and unpack the user-supplied nvlist. - */ - if (size == 0) - return (EINVAL); - -#ifdef _KERNEL - packed = kmem_alloc(size, KM_SLEEP); - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { - kmem_free(packed, size); - return (error); - } -#else - packed = (void *)(uintptr_t)nvl; -#endif - - error = nvlist_unpack(packed, size, &list, 0); - -#ifdef _KERNEL - kmem_free(packed, size); -#endif - - if (error != 0) - return (error); - - *nvp = list; - return (0); -} - -static int -zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - int error = 0; - size_t size; - - VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); - -#ifdef _KERNEL - packed = kmem_alloc(size, KM_SLEEP); - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - KM_SLEEP) == 0); - - if (ddi_copyout(packed, - (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) - error = EFAULT; - kmem_free(packed, size); -#else - packed = (void *)(uintptr_t)zc->zc_nvlist_dst; - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - 0) == 0); -#endif - - zc->zc_nvlist_dst_size = size; - return (error); -} - -static void -zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) -{ - nvlist_t **child; - nvlist_t *nvroot = NULL; - vdev_stat_t *vs; - uint_t c, children, nelem; - - if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - zfs_ioctl_compat_fix_stats_nvlist(child[c]); - } - } - - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0) - zfs_ioctl_compat_fix_stats_nvlist(nvroot); - if ((nvlist_lookup_uint64_array(nvl, "stats", - (uint64_t **)&vs, &nelem) == 0)) { - nvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)vs, nelem); - nvlist_remove(nvl, "stats", - DATA_TYPE_UINT64_ARRAY); - } -} - - -static int -zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) -{ - nvlist_t *nv, *nvp = NULL; - nvpair_t *elem; - int error; - - if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) - return (error); - - if (nc == 5) { /* ZFS_IOC_POOL_STATS */ - elem = NULL; - while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { - if (nvpair_value_nvlist(elem, &nvp) == 0) - zfs_ioctl_compat_fix_stats_nvlist(nvp); - } - elem = NULL; - } else - zfs_ioctl_compat_fix_stats_nvlist(nv); - - error = zfs_ioctl_compat_put_nvlist(zc, nv); - - nvlist_free(nv); - - return (error); -} - -static int -zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) -{ - nvlist_t *nv, *nva = NULL; - int error; - - if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) - return (error); - - if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { - nvlist_add_nvlist(nv, "allocated", nva); - nvlist_remove(nv, "used", DATA_TYPE_NVLIST); - } - - if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { - nvlist_add_nvlist(nv, "free", nva); - nvlist_remove(nv, "available", DATA_TYPE_NVLIST); - } - - error = zfs_ioctl_compat_put_nvlist(zc, nv); - - nvlist_free(nv); - - return (error); -} -#endif - -#ifdef _KERNEL -int -zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag) -{ - int error = 0; - - /* are we creating a clone? */ - if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') - *vec = ZFS_IOC_CLONE; - - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (*vec) { - - case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ - zc->zc_cookie = POOL_SCAN_SCRUB; - break; - } - } - - return (error); -} - -void -zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) -{ - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (vec) { - case ZFS_IOC_POOL_CONFIGS: - case ZFS_IOC_POOL_STATS: - case ZFS_IOC_POOL_TRYIMPORT: - zfs_ioctl_compat_fix_stats(zc, vec); - break; - case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ - zfs_ioctl_compat_pool_get_props(zc); - break; - } - } -} - -nvlist_t * -zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t *innvl, const int vec, - const int cflag) -{ - nvlist_t *nvl, *tmpnvl, *hnvl; - nvpair_t *elem; - char *poolname, *snapname; - int err; - - if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || - cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) - goto out; - - switch (vec) { - case ZFS_IOC_CREATE: - nvl = fnvlist_alloc(); - fnvlist_add_int32(nvl, "type", zc->zc_objset_type); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "props", innvl); - nvlist_free(innvl); - } - return (nvl); - break; - case ZFS_IOC_CLONE: - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, "origin", zc->zc_value); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "props", innvl); - nvlist_free(innvl); - } - return (nvl); - break; - case ZFS_IOC_SNAPSHOT: - if (innvl == NULL) - goto out; - nvl = fnvlist_alloc(); - fnvlist_add_nvlist(nvl, "props", innvl); - tmpnvl = fnvlist_alloc(); - snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); - fnvlist_add_boolean(tmpnvl, snapname); - kmem_free(snapname, strlen(snapname + 1)); - /* check if we are doing a recursive snapshot */ - if (zc->zc_cookie) - dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, - tmpnvl); - fnvlist_add_nvlist(nvl, "snaps", tmpnvl); - fnvlist_free(tmpnvl); - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_SPACE_SNAPS: - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, "firstsnap", zc->zc_value); - if (innvl != NULL) - nvlist_free(innvl); - return (nvl); - break; - case ZFS_IOC_DESTROY_SNAPS: - if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) - goto out; - nvl = fnvlist_alloc(); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "snaps", innvl); - } else { - /* - * We are probably called by even older binaries, - * allocate and populate nvlist with recursive - * snapshots - */ - if (zfs_component_namecheck(zc->zc_value, NULL, - NULL) == 0) { - tmpnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, tmpnvl) == 0) - fnvlist_add_nvlist(nvl, "snaps", - tmpnvl); - nvlist_free(tmpnvl); - } - } - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_HOLD: - nvl = fnvlist_alloc(); - tmpnvl = fnvlist_alloc(); - if (zc->zc_cleanup_fd != -1) - fnvlist_add_int32(nvl, "cleanup_fd", - (int32_t)zc->zc_cleanup_fd); - if (zc->zc_cookie) { - hnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, hnvl) == 0) { - elem = NULL; - while ((elem = nvlist_next_nvpair(hnvl, - elem)) != NULL) { - nvlist_add_string(tmpnvl, - nvpair_name(elem), zc->zc_string); - } - } - nvlist_free(hnvl); - } else { - snapname = kmem_asprintf("%s@%s", zc->zc_name, - zc->zc_value); - nvlist_add_string(tmpnvl, snapname, zc->zc_string); - kmem_free(snapname, strlen(snapname + 1)); - } - fnvlist_add_nvlist(nvl, "holds", tmpnvl); - nvlist_free(tmpnvl); - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_RELEASE: - nvl = fnvlist_alloc(); - tmpnvl = fnvlist_alloc(); - if (zc->zc_cookie) { - hnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, hnvl) == 0) { - elem = NULL; - while ((elem = nvlist_next_nvpair(hnvl, - elem)) != NULL) { - fnvlist_add_boolean(tmpnvl, - zc->zc_string); - fnvlist_add_nvlist(nvl, - nvpair_name(elem), tmpnvl); - } - } - nvlist_free(hnvl); - } else { - snapname = kmem_asprintf("%s@%s", zc->zc_name, - zc->zc_value); - fnvlist_add_boolean(tmpnvl, zc->zc_string); - fnvlist_add_nvlist(nvl, snapname, tmpnvl); - kmem_free(snapname, strlen(snapname + 1)); - } - nvlist_free(tmpnvl); - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - } -out: - return (innvl); -} - -nvlist_t * -zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t *outnvl, const int vec, - const int cflag) -{ - nvlist_t *tmpnvl; - - if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || - cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) - return (outnvl); - - switch (vec) { - case ZFS_IOC_SPACE_SNAPS: - (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); - (void) nvlist_lookup_uint64(outnvl, "compressed", - &zc->zc_objset_type); - (void) nvlist_lookup_uint64(outnvl, "uncompressed", - &zc->zc_perm_action); - nvlist_free(outnvl); - /* return empty outnvl */ - tmpnvl = fnvlist_alloc(); - return (tmpnvl); - break; - case ZFS_IOC_CREATE: - case ZFS_IOC_CLONE: - case ZFS_IOC_HOLD: - case ZFS_IOC_RELEASE: - nvlist_free(outnvl); - /* return empty outnvl */ - tmpnvl = fnvlist_alloc(); - return (tmpnvl); - break; - } - - return (outnvl); -} -#endif /* KERNEL */ diff --git a/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_util_os.c b/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_util_os.c index b116f92d97af..2ac31f1077ca 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_util_os.c +++ b/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_util_os.c @@ -1,215 +1,221 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../../libzfs_impl.h" #include "zfs_prop.h" #include #include #define ZDIFF_SHARESDIR "/.zfs/shares/" int zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) { return (ioctl(hdl->libzfs_fd, request, zc)); } const char * libzfs_error_init(int error) { switch (error) { case ENXIO: return (dgettext(TEXT_DOMAIN, "The ZFS modules are not " "loaded.\nTry running '/sbin/modprobe zfs' as root " "to load them.")); case ENOENT: return (dgettext(TEXT_DOMAIN, "/dev/zfs and /proc/self/mounts " "are required.\nTry running 'udevadm trigger' and 'mount " "-t proc proc /proc' as root.")); case ENOEXEC: return (dgettext(TEXT_DOMAIN, "The ZFS modules cannot be " "auto-loaded.\nTry running '/sbin/modprobe zfs' as " "root to manually load them.")); case EACCES: return (dgettext(TEXT_DOMAIN, "Permission denied the " "ZFS utilities must be run as root.")); default: return (dgettext(TEXT_DOMAIN, "Failed to initialize the " "libzfs library.")); } } static int libzfs_module_loaded(const char *module) { const char path_prefix[] = "/sys/module/"; char path[256]; memcpy(path, path_prefix, sizeof (path_prefix) - 1); strcpy(path + sizeof (path_prefix) - 1, module); return (access(path, F_OK) == 0); } /* * Verify the required ZFS_DEV device is available and optionally attempt * to load the ZFS modules. Under normal circumstances the modules * should already have been loaded by some external mechanism. * * Environment variables: * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV */ static int libzfs_load_module_impl(const char *module) { char *argv[4] = {"/sbin/modprobe", "-q", (char *)module, (char *)0}; char *load_str, *timeout_str; long timeout = 10; /* seconds */ long busy_timeout = 10; /* milliseconds */ int load = 0, fd; hrtime_t start; /* Optionally request module loading */ if (!libzfs_module_loaded(module)) { load_str = getenv("ZFS_MODULE_LOADING"); if (load_str) { if (!strncasecmp(load_str, "YES", strlen("YES")) || !strncasecmp(load_str, "ON", strlen("ON"))) load = 1; else load = 0; } if (load) { if (libzfs_run_process("/sbin/modprobe", argv, 0)) return (ENOEXEC); } if (!libzfs_module_loaded(module)) return (ENXIO); } /* * Device creation by udev is asynchronous and waiting may be * required. Busy wait for 10ms and then fall back to polling every * 10ms for the allowed timeout (default 10s, max 10m). This is * done to optimize for the common case where the device is * immediately available and to avoid penalizing the possible * case where udev is slow or unable to create the device. */ timeout_str = getenv("ZFS_MODULE_TIMEOUT"); if (timeout_str) { timeout = strtol(timeout_str, NULL, 0); timeout = MAX(MIN(timeout, (10 * 60)), 0); /* 0 <= N <= 600 */ } start = gethrtime(); do { fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (fd >= 0) { (void) close(fd); return (0); } else if (errno != ENOENT) { return (errno); } else if (NSEC2MSEC(gethrtime() - start) < busy_timeout) { sched_yield(); } else { usleep(10 * MILLISEC); } } while (NSEC2MSEC(gethrtime() - start) < (timeout * MILLISEC)); return (ENOENT); } int libzfs_load_module(void) { return (libzfs_load_module_impl(ZFS_DRIVER)); } int find_shares_object(differ_info_t *di) { char fullpath[MAXPATHLEN]; struct stat64 sb = { 0 }; (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); if (stat64(fullpath, &sb) != 0) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); } di->shares = (uint64_t)sb.st_ino; return (0); } +int +zfs_destroy_snaps_nvl_os(libzfs_handle_t *hdl, nvlist_t *snaps) +{ + return (0); +} + /* * Fill given version buffer with zfs kernel version read from ZFS_SYSFS_DIR * Returns 0 on success, and -1 on error (with errno set) */ int zfs_version_kernel(char *version, int len) { int _errno; int fd; int rlen; if ((fd = open(ZFS_SYSFS_DIR "/version", O_RDONLY | O_CLOEXEC)) == -1) return (-1); if ((rlen = read(fd, version, len)) == -1) { version[0] = '\0'; _errno = errno; (void) close(fd); errno = _errno; return (-1); } version[rlen-1] = '\0'; /* discard '\n' */ if (close(fd) == -1) return (-1); return (0); } diff --git a/sys/contrib/openzfs/lib/libzfs_core/Makefile.am b/sys/contrib/openzfs/lib/libzfs_core/Makefile.am index b2101e21144d..64cb76f1995b 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs_core/Makefile.am @@ -1,37 +1,53 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfs_core.pc AM_CFLAGS += -fvisibility=hidden lib_LTLIBRARIES = libzfs_core.la include $(top_srcdir)/config/Abigail.am USER_C = \ libzfs_core.c +if BUILD_LINUX +USER_C += \ + os/linux/libzfs_core_ioctl.c +endif + +if BUILD_FREEBSD +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs + +USER_C += \ + os/freebsd/libzfs_core_ioctl.c + +VPATH += $(top_srcdir)/module/os/freebsd/zfs + +nodist_libzfs_core_la_SOURCES = zfs_ioctl_compat.c +endif + libzfs_core_la_SOURCES = $(USER_C) libzfs_core_la_LIBADD = \ - $(abs_top_builddir)/lib/libzutil/libzutil.la \ - $(abs_top_builddir)/lib/libnvpair/libnvpair.la + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libspl/libspl.la libzfs_core_la_LIBADD += $(LTLIBINTL) libzfs_core_la_LDFLAGS = -pthread if !ASAN_ENABLED libzfs_core_la_LDFLAGS += -Wl,-z,defs endif if BUILD_FREEBSD libzfs_core_la_LIBADD += -lutil -lgeom endif libzfs_core_la_LDFLAGS += -version-info 3:0:0 include $(top_srcdir)/config/CppCheck.am # Library ABI EXTRA_DIST = libzfs_core.abi libzfs_core.suppr diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi index 6abec9a8ab5b..b3ae682efcdf 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi @@ -1,3061 +1,2159 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + - - - + + + + - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c index ce33b2153062..cbe486d08ba1 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.c @@ -1,1642 +1,1642 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. */ /* * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. * It has the following characteristics: * * - Thread Safe. libzfs_core is accessible concurrently from multiple * threads. This is accomplished primarily by avoiding global data * (e.g. caching). Since it's thread-safe, there is no reason for a * process to have multiple libzfs "instances". Therefore, we store * our few pieces of data (e.g. the file descriptor) in global * variables. The fd is reference-counted so that the libzfs_core * library can be "initialized" multiple times (e.g. by different * consumers within the same process). * * - Committed Interface. The libzfs_core interface will be committed, * therefore consumers can compile against it and be confident that * their code will continue to work on future releases of this code. * Currently, the interface is Evolving (not Committed), but we intend * to commit to it once it is more complete and we determine that it * meets the needs of all consumers. * * - Programmatic Error Handling. libzfs_core communicates errors with * defined error numbers, and doesn't print anything to stdout/stderr. * * - Thin Layer. libzfs_core is a thin layer, marshaling arguments * to/from the kernel ioctls. There is generally a 1:1 correspondence * between libzfs_core functions and ioctls to ZFS_DEV. * * - Clear Atomicity. Because libzfs_core functions are generally 1:1 * with kernel ioctls, and kernel ioctls are general atomic, each * libzfs_core function is atomic. For example, creating multiple * snapshots with a single call to lzc_snapshot() is atomic -- it * can't fail with only some of the requested snapshots created, even * in the event of power loss or system crash. * * - Continued libzfs Support. Some higher-level operations (e.g. * support for "zfs send -R") are too complicated to fit the scope of * libzfs_core. This functionality will continue to live in libzfs. * Where appropriate, libzfs will use the underlying atomic operations * of libzfs_core. For example, libzfs may implement "zfs send -R | * zfs receive" by using individual "send one snapshot", rename, * destroy, and "receive one snapshot" operations in libzfs_core. * /sbin/zfs and /sbin/zpool will link with both libzfs and * libzfs_core. Other consumers should aim to use only libzfs_core, * since that will be the supported, stable interface going forwards. */ #include #include #include #include #include #ifdef ZFS_DEBUG #include #endif #include #include #include #include #include #include #include #include #include static int g_fd = -1; static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; static int g_refcount; #ifdef ZFS_DEBUG static zfs_ioc_t fail_ioc_cmd = ZFS_IOC_LAST; static zfs_errno_t fail_ioc_err; static void libzfs_core_debug_ioc(void) { /* * To test running newer user space binaries with kernel's * that don't yet support an ioctl or a new ioctl arg we * provide an override to intentionally fail an ioctl. * * USAGE: * The override variable, ZFS_IOC_TEST, is of the form "cmd:err" * * For example, to fail a ZFS_IOC_POOL_CHECKPOINT with a * ZFS_ERR_IOC_CMD_UNAVAIL, the string would be "0x5a4d:1029" * * $ sudo sh -c "ZFS_IOC_TEST=0x5a4d:1029 zpool checkpoint tank" * cannot checkpoint 'tank': the loaded zfs module does not support * this operation. A reboot may be required to enable this operation. */ if (fail_ioc_cmd == ZFS_IOC_LAST) { char *ioc_test = getenv("ZFS_IOC_TEST"); unsigned int ioc_num = 0, ioc_err = 0; if (ioc_test != NULL && sscanf(ioc_test, "%i:%i", &ioc_num, &ioc_err) == 2 && ioc_num < ZFS_IOC_LAST) { fail_ioc_cmd = ioc_num; fail_ioc_err = ioc_err; } } } #endif int libzfs_core_init(void) { (void) pthread_mutex_lock(&g_lock); if (g_refcount == 0) { g_fd = open(ZFS_DEV, O_RDWR|O_CLOEXEC); if (g_fd < 0) { (void) pthread_mutex_unlock(&g_lock); return (errno); } } g_refcount++; #ifdef ZFS_DEBUG libzfs_core_debug_ioc(); #endif (void) pthread_mutex_unlock(&g_lock); return (0); } void libzfs_core_fini(void) { (void) pthread_mutex_lock(&g_lock); ASSERT3S(g_refcount, >, 0); g_refcount--; if (g_refcount == 0 && g_fd != -1) { (void) close(g_fd); g_fd = -1; } (void) pthread_mutex_unlock(&g_lock); } static int lzc_ioctl(zfs_ioc_t ioc, const char *name, nvlist_t *source, nvlist_t **resultp) { zfs_cmd_t zc = {"\0"}; int error = 0; char *packed = NULL; size_t size = 0; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); #ifdef ZFS_DEBUG if (ioc == fail_ioc_cmd) return (fail_ioc_err); #endif if (name != NULL) (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); if (source != NULL) { packed = fnvlist_pack(source, &size); zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; zc.zc_nvlist_src_size = size; } if (resultp != NULL) { *resultp = NULL; if (ioc == ZFS_IOC_CHANNEL_PROGRAM) { zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source, ZCP_ARG_MEMLIMIT); } else { zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); if (zc.zc_nvlist_dst == (uint64_t)0) { error = ENOMEM; goto out; } } - while (zfs_ioctl_fd(g_fd, ioc, &zc) != 0) { + while (lzc_ioctl_fd(g_fd, ioc, &zc) != 0) { /* * If ioctl exited with ENOMEM, we retry the ioctl after * increasing the size of the destination nvlist. * * Channel programs that exit with ENOMEM ran over the * lua memory sandbox; they should not be retried. */ if (errno == ENOMEM && resultp != NULL && ioc != ZFS_IOC_CHANNEL_PROGRAM) { free((void *)(uintptr_t)zc.zc_nvlist_dst); zc.zc_nvlist_dst_size *= 2; zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); if (zc.zc_nvlist_dst == (uint64_t)0) { error = ENOMEM; goto out; } } else { error = errno; break; } } if (zc.zc_nvlist_dst_filled) { *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size); } out: if (packed != NULL) fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zc.zc_nvlist_dst); return (error); } int lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props, uint8_t *wkeydata, uint_t wkeylen) { int error; nvlist_t *hidden_args = NULL; nvlist_t *args = fnvlist_alloc(); fnvlist_add_int32(args, "type", (dmu_objset_type_t)type); if (props != NULL) fnvlist_add_nvlist(args, "props", props); if (wkeydata != NULL) { hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(args, ZPOOL_HIDDEN_ARGS, hidden_args); } error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); nvlist_free(hidden_args); nvlist_free(args); return (error); } int lzc_clone(const char *fsname, const char *origin, nvlist_t *props) { int error; nvlist_t *hidden_args = NULL; nvlist_t *args = fnvlist_alloc(); fnvlist_add_string(args, "origin", origin); if (props != NULL) fnvlist_add_nvlist(args, "props", props); error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); nvlist_free(hidden_args); nvlist_free(args); return (error); } int lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) { /* * The promote ioctl is still legacy, so we need to construct our * own zfs_cmd_t rather than using lzc_ioctl(). */ zfs_cmd_t zc = {"\0"}; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); - if (zfs_ioctl_fd(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { + if (lzc_ioctl_fd(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { int error = errno; if (error == EEXIST && snapnamebuf != NULL) (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen); return (error); } return (0); } int lzc_rename(const char *source, const char *target) { zfs_cmd_t zc = {"\0"}; int error; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - error = zfs_ioctl_fd(g_fd, ZFS_IOC_RENAME, &zc); + error = lzc_ioctl_fd(g_fd, ZFS_IOC_RENAME, &zc); if (error != 0) error = errno; return (error); } int lzc_destroy(const char *fsname) { int error; nvlist_t *args = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_DESTROY, fsname, args, NULL); nvlist_free(args); return (error); } /* * Creates snapshots. * * The keys in the snaps nvlist are the snapshots to be created. * They must all be in the same pool. * * The props nvlist is properties to set. Currently only user properties * are supported. { user:prop_name -> string value } * * The returned results nvlist will have an entry for each snapshot that failed. * The value will be the (int32) error code. * * The return value will be 0 if all snapshots were created, otherwise it will * be the errno of a (unspecified) snapshot that failed. */ int lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) { nvpair_t *elem; nvlist_t *args; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; *errlist = NULL; /* determine the pool name */ elem = nvlist_next_nvpair(snaps, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; args = fnvlist_alloc(); fnvlist_add_nvlist(args, "snaps", snaps); if (props != NULL) fnvlist_add_nvlist(args, "props", props); error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); nvlist_free(args); return (error); } /* * Destroys snapshots. * * The keys in the snaps nvlist are the snapshots to be destroyed. * They must all be in the same pool. * * Snapshots that do not exist will be silently ignored. * * If 'defer' is not set, and a snapshot has user holds or clones, the * destroy operation will fail and none of the snapshots will be * destroyed. * * If 'defer' is set, and a snapshot has user holds or clones, it will be * marked for deferred destruction, and will be destroyed when the last hold * or clone is removed/destroyed. * * The return value will be 0 if all snapshots were destroyed (or marked for * later destruction if 'defer' is set) or didn't exist to begin with. * * Otherwise the return value will be the errno of a (unspecified) snapshot * that failed, no snapshots will be destroyed, and the errlist will have an * entry for each snapshot that failed. The value in the errlist will be * the (int32) error code. */ int lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) { nvpair_t *elem; nvlist_t *args; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; /* determine the pool name */ elem = nvlist_next_nvpair(snaps, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; args = fnvlist_alloc(); fnvlist_add_nvlist(args, "snaps", snaps); if (defer) fnvlist_add_boolean(args, "defer"); error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); nvlist_free(args); return (error); } int lzc_snaprange_space(const char *firstsnap, const char *lastsnap, uint64_t *usedp) { nvlist_t *args; nvlist_t *result; int err; char fs[ZFS_MAX_DATASET_NAME_LEN]; char *atp; /* determine the fs name */ (void) strlcpy(fs, firstsnap, sizeof (fs)); atp = strchr(fs, '@'); if (atp == NULL) return (EINVAL); *atp = '\0'; args = fnvlist_alloc(); fnvlist_add_string(args, "firstsnap", firstsnap); err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); nvlist_free(args); if (err == 0) *usedp = fnvlist_lookup_uint64(result, "used"); fnvlist_free(result); return (err); } boolean_t lzc_exists(const char *dataset) { /* * The objset_stats ioctl is still legacy, so we need to construct our * own zfs_cmd_t rather than using lzc_ioctl(). */ zfs_cmd_t zc = {"\0"}; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - return (zfs_ioctl_fd(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); + return (lzc_ioctl_fd(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); } /* * outnvl is unused. * It was added to preserve the function signature in case it is * needed in the future. */ /*ARGSUSED*/ int lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL)); } /* * Create "user holds" on snapshots. If there is a hold on a snapshot, * the snapshot can not be destroyed. (However, it can be marked for deletion * by lzc_destroy_snaps(defer=B_TRUE).) * * The keys in the nvlist are snapshot names. * The snapshots must all be in the same pool. * The value is the name of the hold (string type). * * If cleanup_fd is not -1, it must be the result of open(ZFS_DEV, O_EXCL). * In this case, when the cleanup_fd is closed (including on process * termination), the holds will be released. If the system is shut down * uncleanly, the holds will be released when the pool is next opened * or imported. * * Holds for snapshots which don't exist will be skipped and have an entry * added to errlist, but will not cause an overall failure. * * The return value will be 0 if all holds, for snapshots that existed, * were successfully created. * * Otherwise the return value will be the errno of a (unspecified) hold that * failed and no holds will be created. * * In all cases the errlist will have an entry for each hold that failed * (name = snapshot), with its value being the error code (int32). */ int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) { char pool[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *args; nvpair_t *elem; int error; /* determine the pool name */ elem = nvlist_next_nvpair(holds, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; args = fnvlist_alloc(); fnvlist_add_nvlist(args, "holds", holds); if (cleanup_fd != -1) fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); nvlist_free(args); return (error); } /* * Release "user holds" on snapshots. If the snapshot has been marked for * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have * any clones, and all the user holds are removed, then the snapshot will be * destroyed. * * The keys in the nvlist are snapshot names. * The snapshots must all be in the same pool. * The value is an nvlist whose keys are the holds to remove. * * Holds which failed to release because they didn't exist will have an entry * added to errlist, but will not cause an overall failure. * * The return value will be 0 if the nvl holds was empty or all holds that * existed, were successfully removed. * * Otherwise the return value will be the errno of a (unspecified) hold that * failed to release and no holds will be released. * * In all cases the errlist will have an entry for each hold that failed to * to release. */ int lzc_release(nvlist_t *holds, nvlist_t **errlist) { char pool[ZFS_MAX_DATASET_NAME_LEN]; nvpair_t *elem; /* determine the pool name */ elem = nvlist_next_nvpair(holds, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); } /* * Retrieve list of user holds on the specified snapshot. * * On success, *holdsp will be set to an nvlist which the caller must free. * The keys are the names of the holds, and the value is the creation time * of the hold (uint64) in seconds since the epoch. */ int lzc_get_holds(const char *snapname, nvlist_t **holdsp) { return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp)); } /* * Generate a zfs send stream for the specified snapshot and write it to * the specified file descriptor. * * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") * * If "from" is NULL, a full (non-incremental) stream will be sent. * If "from" is non-NULL, it must be the full name of a snapshot or * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or * bookmark must represent an earlier point in the history of "snapname"). * It can be an earlier snapshot in the same filesystem or zvol as "snapname", * or it can be the origin of "snapname"'s filesystem, or an earlier * snapshot in the origin, etc. * * "fd" is the file descriptor to write the send stream to. * * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT * records with drr_blksz > 128K. * * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA, * which the receiving system must support (as indicated by support * for the "embedded_data" feature). * * If "flags" contains LZC_SEND_FLAG_COMPRESS, the stream is generated by using * compressed WRITE records for blocks which are compressed on disk and in * memory. If the lz4_compress feature is active on the sending system, then * the receiving system must have that feature enabled as well. * * If "flags" contains LZC_SEND_FLAG_RAW, the stream is generated, for encrypted * datasets, by sending data exactly as it exists on disk. This allows backups * to be taken even if encryption keys are not currently loaded. */ int lzc_send(const char *snapname, const char *from, int fd, enum lzc_send_flags flags) { return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, NULL)); } int lzc_send_redacted(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, const char *redactbook) { return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, redactbook)); } int lzc_send_resume(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) { return (lzc_send_resume_redacted(snapname, from, fd, flags, resumeobj, resumeoff, NULL)); } /* * snapname: The name of the "tosnap", or the snapshot whose contents we are * sending. * from: The name of the "fromsnap", or the incremental source. * fd: File descriptor to write the stream to. * flags: flags that determine features to be used by the stream. * resumeobj: Object to resume from, for resuming send * resumeoff: Offset to resume from, for resuming send. * redactnv: nvlist of string -> boolean(ignored) containing the names of all * the snapshots that we should redact with respect to. * redactbook: Name of the redaction bookmark to create. */ int lzc_send_resume_redacted(const char *snapname, const char *from, int fd, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, const char *redactbook) { nvlist_t *args; int err; args = fnvlist_alloc(); fnvlist_add_int32(args, "fd", fd); if (from != NULL) fnvlist_add_string(args, "fromsnap", from); if (flags & LZC_SEND_FLAG_LARGE_BLOCK) fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); if (flags & LZC_SEND_FLAG_COMPRESS) fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); if (flags & LZC_SEND_FLAG_SAVED) fnvlist_add_boolean(args, "savedok"); if (resumeobj != 0 || resumeoff != 0) { fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); } if (redactbook != NULL) fnvlist_add_string(args, "redactbook", redactbook); err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); nvlist_free(args); return (err); } /* * "from" can be NULL, a snapshot, or a bookmark. * * If from is NULL, a full (non-incremental) stream will be estimated. This * is calculated very efficiently. * * If from is a snapshot, lzc_send_space uses the deadlists attached to * each snapshot to efficiently estimate the stream size. * * If from is a bookmark, the indirect blocks in the destination snapshot * are traversed, looking for blocks with a birth time since the creation TXG of * the snapshot this bookmark was created from. This will result in * significantly more I/O and be less efficient than a send space estimation on * an equivalent snapshot. This process is also used if redact_snaps is * non-null. */ int lzc_send_space_resume_redacted(const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t resume_bytes, const char *redactbook, int fd, uint64_t *spacep) { nvlist_t *args; nvlist_t *result; int err; args = fnvlist_alloc(); if (from != NULL) fnvlist_add_string(args, "from", from); if (flags & LZC_SEND_FLAG_LARGE_BLOCK) fnvlist_add_boolean(args, "largeblockok"); if (flags & LZC_SEND_FLAG_EMBED_DATA) fnvlist_add_boolean(args, "embedok"); if (flags & LZC_SEND_FLAG_COMPRESS) fnvlist_add_boolean(args, "compressok"); if (flags & LZC_SEND_FLAG_RAW) fnvlist_add_boolean(args, "rawok"); if (resumeobj != 0 || resumeoff != 0) { fnvlist_add_uint64(args, "resume_object", resumeobj); fnvlist_add_uint64(args, "resume_offset", resumeoff); fnvlist_add_uint64(args, "bytes", resume_bytes); } if (redactbook != NULL) fnvlist_add_string(args, "redactbook", redactbook); if (fd != -1) fnvlist_add_int32(args, "fd", fd); err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); nvlist_free(args); if (err == 0) *spacep = fnvlist_lookup_uint64(result, "space"); nvlist_free(result); return (err); } int lzc_send_space(const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t *spacep) { return (lzc_send_space_resume_redacted(snapname, from, flags, 0, 0, 0, NULL, -1, spacep)); } static int recv_read(int fd, void *buf, int ilen) { char *cp = buf; int rv; int len = ilen; do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) return (EIO); return (0); } /* * Linux adds ZFS_IOC_RECV_NEW for resumable and raw streams and preserves the * legacy ZFS_IOC_RECV user/kernel interface. The new interface supports all * stream options but is currently only used for resumable streams. This way * updated user space utilities will interoperate with older kernel modules. * * Non-Linux OpenZFS platforms have opted to modify the legacy interface. */ static int recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_replay_record_t drr; char fsname[MAXPATHLEN]; char *atp; int error; boolean_t payload = B_FALSE; ASSERT3S(g_refcount, >, 0); VERIFY3S(g_fd, !=, -1); /* Set 'fsname' to the name of containing filesystem */ (void) strlcpy(fsname, snapname, sizeof (fsname)); atp = strchr(fsname, '@'); if (atp == NULL) return (EINVAL); *atp = '\0'; /* If the fs does not exist, try its parent. */ if (!lzc_exists(fsname)) { char *slashp = strrchr(fsname, '/'); if (slashp == NULL) return (ENOENT); *slashp = '\0'; } /* * The begin_record is normally a non-byteswapped BEGIN record. * For resumable streams it may be set to any non-byteswapped * dmu_replay_record_t. */ if (begin_record == NULL) { error = recv_read(input_fd, &drr, sizeof (drr)); if (error != 0) return (error); } else { drr = *begin_record; payload = (begin_record->drr_payloadlen != 0); } /* * All receives with a payload should use the new interface. */ if (resumable || raw || wkeydata != NULL || payload) { nvlist_t *outnvl = NULL; nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_string(innvl, "snapname", snapname); if (recvdprops != NULL) fnvlist_add_nvlist(innvl, "props", recvdprops); if (localprops != NULL) fnvlist_add_nvlist(innvl, "localprops", localprops); if (wkeydata != NULL) { /* * wkeydata must be placed in the special * ZPOOL_HIDDEN_ARGS nvlist so that it * will not be printed to the zpool history. */ nvlist_t *hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args); nvlist_free(hidden_args); } if (origin != NULL && strlen(origin)) fnvlist_add_string(innvl, "origin", origin); fnvlist_add_byte_array(innvl, "begin_record", (uchar_t *)&drr, sizeof (drr)); fnvlist_add_int32(innvl, "input_fd", input_fd); if (force) fnvlist_add_boolean(innvl, "force"); if (resumable) fnvlist_add_boolean(innvl, "resumable"); error = lzc_ioctl(ZFS_IOC_RECV_NEW, fsname, innvl, &outnvl); if (error == 0 && read_bytes != NULL) error = nvlist_lookup_uint64(outnvl, "read_bytes", read_bytes); if (error == 0 && errflags != NULL) error = nvlist_lookup_uint64(outnvl, "error_flags", errflags); if (error == 0 && errors != NULL) { nvlist_t *nvl; error = nvlist_lookup_nvlist(outnvl, "errors", &nvl); if (error == 0) *errors = fnvlist_dup(nvl); } fnvlist_free(innvl); fnvlist_free(outnvl); } else { zfs_cmd_t zc = {"\0"}; char *packed = NULL; size_t size; ASSERT3S(g_refcount, >, 0); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); if (recvdprops != NULL) { packed = fnvlist_pack(recvdprops, &size); zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; zc.zc_nvlist_src_size = size; } if (localprops != NULL) { packed = fnvlist_pack(localprops, &size); zc.zc_nvlist_conf = (uint64_t)(uintptr_t)packed; zc.zc_nvlist_conf_size = size; } if (origin != NULL) (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); ASSERT3S(drr.drr_type, ==, DRR_BEGIN); zc.zc_begin_record = drr.drr_u.drr_begin; zc.zc_guid = force; zc.zc_cookie = input_fd; zc.zc_cleanup_fd = -1; zc.zc_action_handle = 0; zc.zc_nvlist_dst_size = 128 * 1024; zc.zc_nvlist_dst = (uint64_t)(uintptr_t) malloc(zc.zc_nvlist_dst_size); - error = zfs_ioctl_fd(g_fd, ZFS_IOC_RECV, &zc); + error = lzc_ioctl_fd(g_fd, ZFS_IOC_RECV, &zc); if (error != 0) { error = errno; } else { if (read_bytes != NULL) *read_bytes = zc.zc_cookie; if (errflags != NULL) *errflags = zc.zc_obj; if (errors != NULL) VERIFY0(nvlist_unpack( (void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size, errors, KM_SLEEP)); } if (packed != NULL) fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zc.zc_nvlist_dst); } return (error); } /* * The simplest receive case: receive from the specified fd, creating the * specified snapshot. Apply the specified properties as "received" properties * (which can be overridden by locally-set properties). If the stream is a * clone, its origin snapshot must be specified by 'origin'. The 'force' * flag will cause the target filesystem to be rolled back or destroyed if * necessary to receive. * * Return 0 on success or an errno on failure. * * Note: this interface does not work on dedup'd streams * (those with DMU_BACKUP_FEATURE_DEDUP). */ int lzc_receive(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, B_FALSE, raw, fd, NULL, NULL, NULL, NULL)); } /* * Like lzc_receive, but if the receive fails due to premature stream * termination, the intermediate state will be preserved on disk. In this * case, ECKSUM will be returned. The receive may subsequently be resumed * with a resuming send stream generated by lzc_send_resume(). */ int lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, B_TRUE, raw, fd, NULL, NULL, NULL, NULL)); } /* * Like lzc_receive, but allows the caller to read the begin record and then to * pass it in. That could be useful if the caller wants to derive, for example, * the snapname or the origin parameters based on the information contained in * the begin record. * The begin record must be in its original form as read from the stream, * in other words, it should not be byteswapped. * * The 'resumable' parameter allows to obtain the same behavior as with * lzc_receive_resumable. */ int lzc_receive_with_header(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int fd, const dmu_replay_record_t *begin_record) { if (begin_record == NULL) return (EINVAL); return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, resumable, raw, fd, begin_record, NULL, NULL, NULL)); } /* * Like lzc_receive, but allows the caller to pass all supported arguments * and retrieve all values returned. The only additional input parameter * is 'cleanup_fd' which is used to set a cleanup-on-exit file descriptor. * * The following parameters all provide return values. Several may be set * in the failure case and will contain additional information. * * The 'read_bytes' value will be set to the total number of bytes read. * * The 'errflags' value will contain zprop_errflags_t flags which are * used to describe any failures. * * The 'action_handle' and 'cleanup_fd' are no longer used, and are ignored. * * The 'errors' nvlist contains an entry for each unapplied received * property. Callers are responsible for freeing this nvlist. */ int lzc_receive_one(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } /* * Like lzc_receive_one, but allows the caller to pass an additional 'cmdprops' * argument. * * The 'cmdprops' nvlist contains both override ('zfs receive -o') and * exclude ('zfs receive -x') properties. Callers are responsible for freeing * this nvlist */ int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, force, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } /* * Roll back this filesystem or volume to its most recent snapshot. * If snapnamebuf is not NULL, it will be filled in with the name * of the most recent snapshot. * Note that the latest snapshot may change if a new one is concurrently * created or the current one is destroyed. lzc_rollback_to can be used * to roll back to a specific latest snapshot. * * Return 0 on success or an errno on failure. */ int lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) { nvlist_t *args; nvlist_t *result; int err; args = fnvlist_alloc(); err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); nvlist_free(args); if (err == 0 && snapnamebuf != NULL) { const char *snapname = fnvlist_lookup_string(result, "target"); (void) strlcpy(snapnamebuf, snapname, snapnamelen); } nvlist_free(result); return (err); } /* * Roll back this filesystem or volume to the specified snapshot, * if possible. * * Return 0 on success or an errno on failure. */ int lzc_rollback_to(const char *fsname, const char *snapname) { nvlist_t *args; nvlist_t *result; int err; args = fnvlist_alloc(); fnvlist_add_string(args, "target", snapname); err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); nvlist_free(args); nvlist_free(result); return (err); } /* * Creates new bookmarks from existing snapshot or bookmark. * * The bookmarks nvlist maps from the full name of the new bookmark to * the full name of the source snapshot or bookmark. * All the bookmarks and snapshots must be in the same pool. * The new bookmarks names must be unique. * => see function dsl_bookmark_create_nvl_validate * * The returned results nvlist will have an entry for each bookmark that failed. * The value will be the (int32) error code. * * The return value will be 0 if all bookmarks were created, otherwise it will * be the errno of a (undetermined) bookmarks that failed. */ int lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) { nvpair_t *elem; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; /* determine pool name from first bookmark */ elem = nvlist_next_nvpair(bookmarks, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/#")] = '\0'; error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist); return (error); } /* * Retrieve bookmarks. * * Retrieve the list of bookmarks for the given file system. The props * parameter is an nvlist of property names (with no values) that will be * returned for each bookmark. * * The following are valid properties on bookmarks, most of which are numbers * (represented as uint64 in the nvlist), except redact_snaps, which is a * uint64 array, and redact_complete, which is a boolean * * "guid" - globally unique identifier of the snapshot it refers to * "createtxg" - txg when the snapshot it refers to was created * "creation" - timestamp when the snapshot it refers to was created * "ivsetguid" - IVset guid for identifying encrypted snapshots * "redact_snaps" - list of guids of the redaction snapshots for the specified * bookmark. If the bookmark is not a redaction bookmark, the nvlist will * not contain an entry for this value. If it is redacted with respect to * no snapshots, it will contain value -> NULL uint64 array * "redact_complete" - boolean value; true if the redaction bookmark is * complete, false otherwise. * * The format of the returned nvlist as follows: * -> { * -> { * "value" -> uint64 * } * ... * "redact_snaps" -> { * "value" -> uint64 array * } * "redact_complete" -> { * "value" -> boolean value * } * } */ int lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) { return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); } /* * Get bookmark properties. * * Given a bookmark's full name, retrieve all properties for the bookmark. * * The format of the returned property list is as follows: * { * -> { * "value" -> uint64 * } * ... * "redact_snaps" -> { * "value" -> uint64 array * } */ int lzc_get_bookmark_props(const char *bookmark, nvlist_t **props) { int error; nvlist_t *innvl = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, innvl, props); fnvlist_free(innvl); return (error); } /* * Destroys bookmarks. * * The keys in the bmarks nvlist are the bookmarks to be destroyed. * They must all be in the same pool. Bookmarks are specified as * #. * * Bookmarks that do not exist will be silently ignored. * * The return value will be 0 if all bookmarks that existed were destroyed. * * Otherwise the return value will be the errno of a (undetermined) bookmark * that failed, no bookmarks will be destroyed, and the errlist will have an * entry for each bookmarks that failed. The value in the errlist will be * the (int32) error code. */ int lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist) { nvpair_t *elem; int error; char pool[ZFS_MAX_DATASET_NAME_LEN]; /* determine the pool name */ elem = nvlist_next_nvpair(bmarks, NULL); if (elem == NULL) return (0); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/#")] = '\0'; error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist); return (error); } static int lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync, uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) { int error; nvlist_t *args; args = fnvlist_alloc(); fnvlist_add_string(args, ZCP_ARG_PROGRAM, program); fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl); fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync); fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit); fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit); error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl); fnvlist_free(args); return (error); } /* * Executes a channel program. * * If this function returns 0 the channel program was successfully loaded and * ran without failing. Note that individual commands the channel program ran * may have failed and the channel program is responsible for reporting such * errors through outnvl if they are important. * * This method may also return: * * EINVAL The program contains syntax errors, or an invalid memory or time * limit was given. No part of the channel program was executed. * If caused by syntax errors, 'outnvl' contains information about the * errors. * * ECHRNG The program was executed, but encountered a runtime error, such as * calling a function with incorrect arguments, invoking the error() * function directly, failing an assert() command, etc. Some portion * of the channel program may have executed and committed changes. * Information about the failure can be found in 'outnvl'. * * ENOMEM The program fully executed, but the output buffer was not large * enough to store the returned value. No output is returned through * 'outnvl'. * * ENOSPC The program was terminated because it exceeded its memory usage * limit. Some portion of the channel program may have executed and * committed changes to disk. No output is returned through 'outnvl'. * * ETIME The program was terminated because it exceeded its Lua instruction * limit. Some portion of the channel program may have executed and * committed changes to disk. No output is returned through 'outnvl'. */ int lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) { return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit, memlimit, argnvl, outnvl)); } /* * Creates a checkpoint for the specified pool. * * If this function returns 0 the pool was successfully checkpointed. * * This method may also return: * * ZFS_ERR_CHECKPOINT_EXISTS * The pool already has a checkpoint. A pools can only have one * checkpoint at most, at any given time. * * ZFS_ERR_DISCARDING_CHECKPOINT * ZFS is in the middle of discarding a checkpoint for this pool. * The pool can be checkpointed again once the discard is done. * * ZFS_DEVRM_IN_PROGRESS * A vdev is currently being removed. The pool cannot be * checkpointed until the device removal is done. * * ZFS_VDEV_TOO_BIG * One or more top-level vdevs exceed the maximum vdev size * supported for this feature. */ int lzc_pool_checkpoint(const char *pool) { int error; nvlist_t *result = NULL; nvlist_t *args = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Discard the checkpoint from the specified pool. * * If this function returns 0 the checkpoint was successfully discarded. * * This method may also return: * * ZFS_ERR_NO_CHECKPOINT * The pool does not have a checkpoint. * * ZFS_ERR_DISCARDING_CHECKPOINT * ZFS is already in the middle of discarding the checkpoint. */ int lzc_pool_checkpoint_discard(const char *pool) { int error; nvlist_t *result = NULL; nvlist_t *args = fnvlist_alloc(); error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Executes a read-only channel program. * * A read-only channel program works programmatically the same way as a * normal channel program executed with lzc_channel_program(). The only * difference is it runs exclusively in open-context and therefore can * return faster. The downside to that, is that the program cannot change * on-disk state by calling functions from the zfs.sync submodule. * * The return values of this function (and their meaning) are exactly the * same as the ones described in lzc_channel_program(). */ int lzc_channel_program_nosync(const char *pool, const char *program, uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) { return (lzc_channel_program_impl(pool, program, B_FALSE, timeout, memlimit, argnvl, outnvl)); } /* * Performs key management functions * * crypto_cmd should be a value from dcp_cmd_t. If the command specifies to * load or change a wrapping key, the key should be specified in the * hidden_args nvlist so that it is not logged. */ int lzc_load_key(const char *fsname, boolean_t noop, uint8_t *wkeydata, uint_t wkeylen) { int error; nvlist_t *ioc_args; nvlist_t *hidden_args; if (wkeydata == NULL) return (EINVAL); ioc_args = fnvlist_alloc(); hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(ioc_args, ZPOOL_HIDDEN_ARGS, hidden_args); if (noop) fnvlist_add_boolean(ioc_args, "noop"); error = lzc_ioctl(ZFS_IOC_LOAD_KEY, fsname, ioc_args, NULL); nvlist_free(hidden_args); nvlist_free(ioc_args); return (error); } int lzc_unload_key(const char *fsname) { return (lzc_ioctl(ZFS_IOC_UNLOAD_KEY, fsname, NULL, NULL)); } int lzc_change_key(const char *fsname, uint64_t crypt_cmd, nvlist_t *props, uint8_t *wkeydata, uint_t wkeylen) { int error; nvlist_t *ioc_args = fnvlist_alloc(); nvlist_t *hidden_args = NULL; fnvlist_add_uint64(ioc_args, "crypt_cmd", crypt_cmd); if (wkeydata != NULL) { hidden_args = fnvlist_alloc(); fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); fnvlist_add_nvlist(ioc_args, ZPOOL_HIDDEN_ARGS, hidden_args); } if (props != NULL) fnvlist_add_nvlist(ioc_args, "props", props); error = lzc_ioctl(ZFS_IOC_CHANGE_KEY, fsname, ioc_args, NULL); nvlist_free(hidden_args); nvlist_free(ioc_args); return (error); } int lzc_reopen(const char *pool_name, boolean_t scrub_restart) { nvlist_t *args = fnvlist_alloc(); int error; fnvlist_add_boolean_value(args, "scrub_restart", scrub_restart); error = lzc_ioctl(ZFS_IOC_POOL_REOPEN, pool_name, args, NULL); nvlist_free(args); return (error); } /* * Changes initializing state. * * vdevs should be a list of (, guid) where guid is a uint64 vdev GUID. * The key is ignored. * * If there are errors related to vdev arguments, per-vdev errors are returned * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where * guid is stringified with PRIu64, and errno is one of the following as * an int64_t: * - ENODEV if the device was not found * - EINVAL if the devices is not a leaf or is not concrete (e.g. missing) * - EROFS if the device is not writeable * - EBUSY start requested but the device is already being either * initialized or trimmed * - ESRCH cancel/suspend requested but device is not being initialized * * If the errlist is empty, then return value will be: * - EINVAL if one or more arguments was invalid * - Other spa_open failures * - 0 if the operation succeeded */ int lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type, nvlist_t *vdevs, nvlist_t **errlist) { int error; nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type); fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs); error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist); fnvlist_free(args); return (error); } /* * Changes TRIM state. * * vdevs should be a list of (, guid) where guid is a uint64 vdev GUID. * The key is ignored. * * If there are errors related to vdev arguments, per-vdev errors are returned * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where * guid is stringified with PRIu64, and errno is one of the following as * an int64_t: * - ENODEV if the device was not found * - EINVAL if the devices is not a leaf or is not concrete (e.g. missing) * - EROFS if the device is not writeable * - EBUSY start requested but the device is already being either trimmed * or initialized * - ESRCH cancel/suspend requested but device is not being initialized * - EOPNOTSUPP if the device does not support TRIM (or secure TRIM) * * If the errlist is empty, then return value will be: * - EINVAL if one or more arguments was invalid * - Other spa_open failures * - 0 if the operation succeeded */ int lzc_trim(const char *poolname, pool_trim_func_t cmd_type, uint64_t rate, boolean_t secure, nvlist_t *vdevs, nvlist_t **errlist) { int error; nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, ZPOOL_TRIM_COMMAND, (uint64_t)cmd_type); fnvlist_add_nvlist(args, ZPOOL_TRIM_VDEVS, vdevs); fnvlist_add_uint64(args, ZPOOL_TRIM_RATE, rate); fnvlist_add_boolean_value(args, ZPOOL_TRIM_SECURE, secure); error = lzc_ioctl(ZFS_IOC_POOL_TRIM, poolname, args, errlist); fnvlist_free(args); return (error); } /* * Create a redaction bookmark named bookname by redacting snapshot with respect * to all the snapshots in snapnv. */ int lzc_redact(const char *snapshot, const char *bookname, nvlist_t *snapnv) { nvlist_t *args = fnvlist_alloc(); fnvlist_add_string(args, "bookname", bookname); fnvlist_add_nvlist(args, "snapnv", snapnv); int error = lzc_ioctl(ZFS_IOC_REDACT, snapshot, args, NULL); fnvlist_free(args); return (error); } static int wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { nvlist_t *args = fnvlist_alloc(); nvlist_t *result = NULL; fnvlist_add_int32(args, ZPOOL_WAIT_ACTIVITY, activity); if (use_tag) fnvlist_add_uint64(args, ZPOOL_WAIT_TAG, tag); int error = lzc_ioctl(ZFS_IOC_WAIT, pool, args, &result); if (error == 0 && waited != NULL) *waited = fnvlist_lookup_boolean_value(result, ZPOOL_WAIT_WAITED); fnvlist_free(args); fnvlist_free(result); return (error); } int lzc_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (wait_common(pool, activity, B_FALSE, 0, waited)); } int lzc_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (wait_common(pool, activity, B_TRUE, tag, waited)); } int lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) { nvlist_t *args = fnvlist_alloc(); nvlist_t *result = NULL; fnvlist_add_int32(args, ZFS_WAIT_ACTIVITY, activity); int error = lzc_ioctl(ZFS_IOC_WAIT_FS, fs, args, &result); if (error == 0 && waited != NULL) *waited = fnvlist_lookup_boolean_value(result, ZFS_WAIT_WAITED); fnvlist_free(args); fnvlist_free(result); return (error); } /* * Set the bootenv contents for the given pool. */ int lzc_set_bootenv(const char *pool, const nvlist_t *env) { return (lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, (nvlist_t *)env, NULL)); } /* * Get the contents of the bootenv of the given pool. */ int lzc_get_bootenv(const char *pool, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); } diff --git a/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_compat.c b/sys/contrib/openzfs/lib/libzfs_core/os/freebsd/libzfs_core_ioctl.c similarity index 97% rename from sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_compat.c rename to sys/contrib/openzfs/lib/libzfs_core/os/freebsd/libzfs_core_ioctl.c index baaf4b598ab1..b8394886d034 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_compat.c +++ b/sys/contrib/openzfs/lib/libzfs_core/os/freebsd/libzfs_core_ioctl.c @@ -1,124 +1,123 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include -#include - #include +#include int zfs_ioctl_version = ZFS_IOCVER_UNDEF; /* * Get zfs_ioctl_version */ static int get_zfs_ioctl_version(void) { size_t ver_size; int ver = ZFS_IOCVER_NONE; ver_size = sizeof (ver); sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); return (ver); } static int zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) { int newrequest, ret; void *zc_c = NULL; unsigned long ncmd; zfs_iocparm_t zp; switch (cflag) { case ZFS_CMD_COMPAT_NONE: ncmd = _IOWR('Z', request, zfs_iocparm_t); zp.zfs_cmd = (uint64_t)(uintptr_t)zc; zp.zfs_cmd_size = sizeof (zfs_cmd_t); zp.zfs_ioctl_version = ZFS_IOCVER_OZFS; break; case ZFS_CMD_COMPAT_LEGACY: newrequest = zfs_ioctl_ozfs_to_legacy(request); ncmd = _IOWR('Z', newrequest, zfs_iocparm_t); zc_c = malloc(sizeof (zfs_cmd_legacy_t)); zfs_cmd_ozfs_to_legacy(zc, zc_c); zp.zfs_cmd = (uint64_t)(uintptr_t)zc_c; zp.zfs_cmd_size = sizeof (zfs_cmd_legacy_t); zp.zfs_ioctl_version = ZFS_IOCVER_LEGACY; break; default: abort(); return (EINVAL); } ret = ioctl(fd, ncmd, &zp); if (ret) { if (zc_c) free(zc_c); return (ret); } if (zc_c) { zfs_cmd_legacy_to_ozfs(zc_c, zc); free(zc_c); } return (ret); } /* * This is FreeBSD version of ioctl, because Solaris' ioctl() updates * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an * error is returned zc_nvlist_dst_size won't be updated. */ int -zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +lzc_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) { size_t oldsize; int ret, cflag = ZFS_CMD_COMPAT_NONE; if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) zfs_ioctl_version = get_zfs_ioctl_version(); switch (zfs_ioctl_version) { case ZFS_IOCVER_LEGACY: cflag = ZFS_CMD_COMPAT_LEGACY; break; case ZFS_IOCVER_OZFS: cflag = ZFS_CMD_COMPAT_NONE; break; default: errx(1, "unrecognized zfs ioctl version %d", zfs_ioctl_version); } oldsize = zc->zc_nvlist_dst_size; ret = zcmd_ioctl_compat(fd, request, zc, cflag); if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { ret = -1; errno = ENOMEM; } return (ret); } diff --git a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_compat.c b/sys/contrib/openzfs/lib/libzfs_core/os/linux/libzfs_core_ioctl.c similarity index 91% rename from sys/contrib/openzfs/lib/libzutil/os/linux/zutil_compat.c rename to sys/contrib/openzfs/lib/libzfs_core/os/linux/libzfs_core_ioctl.c index 173ae9cb619c..9b44a4e3be0d 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_compat.c +++ b/sys/contrib/openzfs/lib/libzfs_core/os/linux/libzfs_core_ioctl.c @@ -1,30 +1,30 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include -#include +#include int -zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +lzc_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) { return (ioctl(fd, request, zc)); } diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am index c9a55591e5ca..3cc0c2f2ec05 100644 --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -1,240 +1,240 @@ include $(top_srcdir)/config/Rules.am VPATH = \ $(top_srcdir)/module/zfs \ $(top_srcdir)/module/zcommon \ $(top_srcdir)/module/lua \ $(top_srcdir)/module/os/linux/zfs \ $(top_srcdir)/lib/libzpool if BUILD_FREEBSD DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs endif if BUILD_LINUX DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/linux/zfs endif # Unconditionally enable debugging for libzpool AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) # Includes kernel code generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) AM_CFLAGS += $(ZLIB_CFLAGS) AM_CFLAGS += -DLIB_ZPOOL_BUILD lib_LTLIBRARIES = libzpool.la USER_C = \ kernel.c \ taskq.c \ util.c KERNEL_C = \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_aarch64_neon.c \ zfs_fletcher_avx512.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ zprop_common.c \ abd.c \ abd_os.c \ aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ bptree.c \ btree.c \ bqueue.c \ cityhash.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ ddt_zap.c \ dmu.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ dmu_recv.c \ dmu_redact.c \ dmu_send.c \ dmu_traverse.c \ dmu_tx.c \ dmu_zfetch.c \ dnode.c \ dnode_sync.c \ dsl_bookmark.c \ dsl_dataset.c \ dsl_deadlist.c \ dsl_deleg.c \ dsl_dir.c \ dsl_crypt.c \ dsl_pool.c \ dsl_prop.c \ dsl_scan.c \ dsl_synctask.c \ dsl_destroy.c \ dsl_userhold.c \ edonr_zfs.c \ hkdf.c \ fm.c \ gzip.c \ lzjb.c \ lz4.c \ metaslab.c \ mmp.c \ multilist.c \ objlist.c \ pathname.c \ range_tree.c \ refcount.c \ rrwlock.c \ sa.c \ sha256.c \ skein_zfs.c \ spa.c \ spa_boot.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ spa_history.c \ spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ space_map.c \ space_reftree.c \ txg.c \ trace.c \ uberblock.c \ unique.c \ vdev.c \ vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ vdev_indirect_mapping.c \ vdev_initialize.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ vdev_queue.c \ vdev_raidz.c \ vdev_raidz_math_aarch64_neon.c \ vdev_raidz_math_aarch64_neonx2.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ vdev_raidz_math.c \ vdev_raidz_math_scalar.c \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_raidz_math_powerpc_altivec.c \ vdev_rebuild.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ zap.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ zcp_get.c \ zcp_global.c \ zcp_iter.c \ zcp_set.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ zfs_debug.c \ zfs_fm.c \ zfs_fuid.c \ zfs_racct.c \ zfs_sa.c \ zfs_znode.c \ zfs_ratelimit.c \ zfs_rlock.c \ zil.c \ zio.c \ zio_checksum.c \ zio_compress.c \ zio_crypt.c \ zio_inject.c \ zle.c \ zrlock.c \ zthr.c LUA_C = \ lapi.c \ lauxlib.c \ lbaselib.c \ lcode.c \ lcompat.c \ lcorolib.c \ lctype.c \ ldebug.c \ ldo.c \ lfunc.c \ lgc.c \ llex.c \ lmem.c \ lobject.c \ lopcodes.c \ lparser.c \ lstate.c \ lstring.c \ lstrlib.c \ ltable.c \ ltablib.c \ ltm.c \ lvm.c \ lzio.c dist_libzpool_la_SOURCES = \ $(USER_C) nodist_libzpool_la_SOURCES = \ $(KERNEL_C) \ $(LUA_C) libzpool_la_LIBADD = \ $(abs_top_builddir)/lib/libicp/libicp.la \ $(abs_top_builddir)/lib/libunicode/libunicode.la \ - $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ - $(abs_top_builddir)/lib/libzstd/libzstd.la + $(abs_top_builddir)/lib/libzstd/libzstd.la \ + $(abs_top_builddir)/lib/libzutil/libzutil.la libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm libzpool_la_LDFLAGS = -pthread if !ASAN_ENABLED libzpool_la_LDFLAGS += -Wl,-z,defs endif if BUILD_FREEBSD libzpool_la_LIBADD += -lgeom endif libzpool_la_LDFLAGS += -version-info 5:0:0 if TARGET_CPU_POWERPC vdev_raidz_math_powerpc_altivec.$(OBJEXT): CFLAGS += -maltivec vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec endif include $(top_srcdir)/config/CppCheck.am diff --git a/sys/contrib/openzfs/lib/libzpool/util.c b/sys/contrib/openzfs/lib/libzpool/util.c index 20cabe7c2e29..a2bdfec1d173 100644 --- a/sys/contrib/openzfs/lib/libzpool/util.c +++ b/sys/contrib/openzfs/lib/libzpool/util.c @@ -1,293 +1,351 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 Jason King * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Routines needed by more than one client of libzpool. */ static void show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) { vdev_stat_t *vs; vdev_stat_t *v0 = { 0 }; uint64_t sec; uint64_t is_log = 0; nvlist_t **child; uint_t c, children; char used[6], avail[6]; char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL); if (indent == 0 && desc != NULL) { (void) printf(" " " capacity operations bandwidth ---- errors ----\n"); (void) printf("description " "used avail read write read write read write cksum\n"); } if (desc != NULL) { char *suffix = "", *bias = NULL; char bias_suffix[32]; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) vs = v0; if (bias != NULL) { (void) snprintf(bias_suffix, sizeof (bias_suffix), " (%s)", bias); suffix = bias_suffix; } else if (is_log) { suffix = " (log)"; } sec = MAX(1, vs->vs_timestamp / NANOSEC); nicenum(vs->vs_alloc, used, sizeof (used)); nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof (avail)); nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof (rops)); nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof (wops)); nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes, sizeof (rbytes)); nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes, sizeof (wbytes)); nicenum(vs->vs_read_errors, rerr, sizeof (rerr)); nicenum(vs->vs_write_errors, werr, sizeof (werr)); nicenum(vs->vs_checksum_errors, cerr, sizeof (cerr)); (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", indent, "", desc, (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)), suffix, vs->vs_space ? 6 : 0, vs->vs_space ? used : "", vs->vs_space ? 6 : 0, vs->vs_space ? avail : "", rops, wops, rbytes, wbytes, rerr, werr, cerr); } umem_free(v0, sizeof (*v0)); if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) return; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; char *cname = NULL, *tname; uint64_t np; int len; if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) && nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname)) cname = ""; len = strlen(cname) + 2; tname = umem_zalloc(len, UMEM_NOFAIL); (void) strlcpy(tname, cname, len); if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0) tname[strlen(tname)] = '0' + np; show_vdev_stats(tname, ctype, cnv, indent + 2); umem_free(tname, len); } } void show_pool_stats(spa_t *spa) { nvlist_t *config, *nvroot; char *name; VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0); show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0); show_vdev_stats(NULL, ZPOOL_CONFIG_SPARES, nvroot, 0); nvlist_free(config); } /* *k_out must be freed by the caller */ static int set_global_var_parse_kv(const char *arg, char **k_out, u_longlong_t *v_out) { int err; VERIFY(arg); char *d = strdup(arg); char *save = NULL; char *k = strtok_r(d, "=", &save); char *v_str = strtok_r(NULL, "=", &save); char *follow = strtok_r(NULL, "=", &save); if (k == NULL || v_str == NULL || follow != NULL) { err = EINVAL; goto err_free; } u_longlong_t val = strtoull(v_str, NULL, 0); if (val > UINT32_MAX) { fprintf(stderr, "Value for global variable '%s' must " "be a 32-bit unsigned integer, got '%s'\n", k, v_str); err = EOVERFLOW; goto err_free; } *k_out = k; *v_out = val; return (0); err_free: free(k); return (err); } /* * Sets given global variable in libzpool to given unsigned 32-bit value. * arg: "=" */ int set_global_var(char const *arg) { void *zpoolhdl; char *varname; u_longlong_t val; int ret; #ifndef _ZFS_LITTLE_ENDIAN /* * On big endian systems changing a 64-bit variable would set the high * 32 bits instead of the low 32 bits, which could cause unexpected * results. */ fprintf(stderr, "Setting global variables is only supported on " "little-endian systems\n"); ret = ENOTSUP; goto out_ret; #endif if ((ret = set_global_var_parse_kv(arg, &varname, &val)) != 0) { goto out_ret; } zpoolhdl = dlopen("libzpool.so", RTLD_LAZY); if (zpoolhdl != NULL) { uint32_t *var; var = dlsym(zpoolhdl, varname); if (var == NULL) { fprintf(stderr, "Global variable '%s' does not exist " "in libzpool.so\n", varname); ret = EINVAL; goto out_dlclose; } *var = (uint32_t)val; } else { fprintf(stderr, "Failed to open libzpool.so to set global " "variable\n"); ret = EIO; goto out_dlclose; } ret = 0; out_dlclose: dlclose(zpoolhdl); free(varname); out_ret: return (ret); } static nvlist_t * refresh_config(void *unused, nvlist_t *tryconfig) { return (spa_tryimport(tryconfig)); } +#if defined(__FreeBSD__) + +#include +#include +#include + static int -pool_active(void *unused, const char *name, uint64_t guid, - boolean_t *isactive) +pool_active(void *unused, const char *name, uint64_t guid, boolean_t *isactive) { - zfs_cmd_t *zcp; - nvlist_t *innvl; - char *packed = NULL; - size_t size = 0; - int fd, ret; + zfs_iocparm_t zp; + zfs_cmd_t *zc = NULL; + zfs_cmd_legacy_t *zcl = NULL; + unsigned long request; + int ret; + + int fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); + if (fd < 0) + return (-1); /* - * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active + * Use ZFS_IOC_POOL_STATS to check if the pool is active. We want to + * avoid adding a dependency on libzfs_core solely for this ioctl(), + * therefore we manually craft the stats command. Note that the command + * ID is identical between the openzfs and legacy ioctl() formats. */ + int ver = ZFS_IOCVER_NONE; + size_t ver_size = sizeof (ver); - fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); - if (fd < 0) - return (-1); + sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); + + switch (ver) { + case ZFS_IOCVER_OZFS: + zc = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); + + (void) strlcpy(zc->zc_name, name, sizeof (zc->zc_name)); + zp.zfs_cmd = (uint64_t)(uintptr_t)zc; + zp.zfs_cmd_size = sizeof (zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_OZFS; + + request = _IOWR('Z', ZFS_IOC_POOL_STATS, zfs_iocparm_t); + ret = ioctl(fd, request, &zp); + + free((void *)(uintptr_t)zc->zc_nvlist_dst); + umem_free(zc, sizeof (zfs_cmd_t)); + + break; + case ZFS_IOCVER_LEGACY: + zcl = umem_zalloc(sizeof (zfs_cmd_legacy_t), UMEM_NOFAIL); + + (void) strlcpy(zcl->zc_name, name, sizeof (zcl->zc_name)); + zp.zfs_cmd = (uint64_t)(uintptr_t)zcl; + zp.zfs_cmd_size = sizeof (zfs_cmd_legacy_t); + zp.zfs_ioctl_version = ZFS_IOCVER_LEGACY; + + request = _IOWR('Z', ZFS_IOC_POOL_STATS, zfs_iocparm_t); + ret = ioctl(fd, request, &zp); - zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); + free((void *)(uintptr_t)zcl->zc_nvlist_dst); + umem_free(zcl, sizeof (zfs_cmd_legacy_t)); - innvl = fnvlist_alloc(); - fnvlist_add_boolean_value(innvl, "force", B_FALSE); + break; + default: + fprintf(stderr, "unrecognized zfs ioctl version %d", ver); + exit(1); + } + + (void) close(fd); + + *isactive = (ret == 0); + return (0); +} +#else +static int +pool_active(void *unused, const char *name, uint64_t guid, + boolean_t *isactive) +{ + int fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); + if (fd < 0) + return (-1); + + /* + * Use ZFS_IOC_POOL_STATS to check if a pool is active. + */ + zfs_cmd_t *zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); (void) strlcpy(zcp->zc_name, name, sizeof (zcp->zc_name)); - packed = fnvlist_pack(innvl, &size); - zcp->zc_nvlist_src = (uint64_t)(uintptr_t)packed; - zcp->zc_nvlist_src_size = size; - ret = zfs_ioctl_fd(fd, ZFS_IOC_POOL_SYNC, zcp); + int ret = ioctl(fd, ZFS_IOC_POOL_STATS, zcp); - fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zcp->zc_nvlist_dst); - nvlist_free(innvl); umem_free(zcp, sizeof (zfs_cmd_t)); (void) close(fd); *isactive = (ret == 0); return (0); } +#endif const pool_config_ops_t libzpool_config_ops = { .pco_refresh_config = refresh_config, .pco_pool_active = pool_active, }; diff --git a/sys/contrib/openzfs/lib/libzutil/Makefile.am b/sys/contrib/openzfs/lib/libzutil/Makefile.am index 0bc29f05e0fb..b163250619ba 100644 --- a/sys/contrib/openzfs/lib/libzutil/Makefile.am +++ b/sys/contrib/openzfs/lib/libzutil/Makefile.am @@ -1,55 +1,47 @@ include $(top_srcdir)/config/Rules.am # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUDEV_CFLAGS) AM_CFLAGS += -fvisibility=hidden DEFAULT_INCLUDES += -I$(srcdir) noinst_LTLIBRARIES = libzutil.la USER_C = \ zutil_device_path.c \ zutil_import.c \ zutil_import.h \ zutil_nicenum.c \ zutil_pool.c if BUILD_LINUX USER_C += \ os/linux/zutil_device_path_os.c \ - os/linux/zutil_import_os.c \ - os/linux/zutil_compat.c + os/linux/zutil_import_os.c endif if BUILD_FREEBSD -DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs - USER_C += \ os/freebsd/zutil_device_path_os.c \ - os/freebsd/zutil_import_os.c \ - os/freebsd/zutil_compat.c - -VPATH += $(top_srcdir)/module/os/freebsd/zfs - -nodist_libzutil_la_SOURCES = zfs_ioctl_compat.c + os/freebsd/zutil_import_os.c endif libzutil_la_SOURCES = $(USER_C) libzutil_la_LIBADD = \ $(abs_top_builddir)/lib/libavl/libavl.la \ $(abs_top_builddir)/lib/libtpool/libtpool.la \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ $(abs_top_builddir)/lib/libspl/libspl.la if BUILD_LINUX libzutil_la_LIBADD += \ $(abs_top_builddir)/lib/libefi/libefi.la \ -lrt endif libzutil_la_LIBADD += -lm $(LIBBLKID_LIBS) $(LIBUDEV_LIBS) include $(top_srcdir)/config/CppCheck.am diff --git a/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c b/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c index 2d8900ce2483..3da661f4c557 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c +++ b/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c @@ -1,249 +1,254 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. * Copyright 2016 Nexenta Systems, Inc. */ /* * Pool import support functions. * * To import a pool, we rely on reading the configuration information from the * ZFS label of each device. If we successfully read the label, then we * organize the configuration information in the following hierarchy: * * pool guid -> toplevel vdev guid -> label txg * * Duplicate entries matching this same tuple will be discarded. Once we have * examined every device, we pick the best label txg config for each toplevel * vdev. We then arrange these toplevel vdevs into a complete pool config, and * update any paths that have changed. Finally, we attempt to import the pool * using our derived config, and record the results. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zutil_import.h" /* * Update a leaf vdev's persistent device strings * * - only applies for a dedicated leaf vdev (aka whole disk) * - updated during pool create|add|attach|import * - used for matching device matching during auto-{online,expand,replace} * - stored in a leaf disk config label (i.e. alongside 'path' NVP) * - these strings are currently not used in kernel (i.e. for vdev_disk_open) * * On FreeBSD we currently just strip devid and phys_path to avoid confusion. */ void update_vdev_config_dev_strs(nvlist_t *nv) { (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); } /* * Do not even look at these devices. */ static const char * const excluded_devs[] = { "nfslock", "sequencer", "zfs", }; #define EXCLUDED_DIR "/dev/" #define EXCLUDED_DIR_LEN 5 void zpool_open_func(void *arg) { rdsk_node_t *rn = arg; struct stat64 statbuf; nvlist_t *config; size_t i; int num_labels; int fd; off_t mediasize = 0; /* * Do not even look at excluded devices. */ if (strncmp(rn->rn_name, EXCLUDED_DIR, EXCLUDED_DIR_LEN) == 0) { char *name = rn->rn_name + EXCLUDED_DIR_LEN; for (i = 0; i < nitems(excluded_devs); ++i) { const char *excluded_name = excluded_devs[i]; size_t len = strlen(excluded_name); if (strncmp(name, excluded_name, len) == 0) { return; } } } /* * O_NONBLOCK so we don't hang trying to open things like serial ports. */ if ((fd = open(rn->rn_name, O_RDONLY|O_NONBLOCK|O_CLOEXEC)) < 0) return; /* * Ignore failed stats. */ if (fstat64(fd, &statbuf) != 0) goto out; /* * We only want regular files, character devs and block devs. */ if (S_ISREG(statbuf.st_mode)) { /* Check if this file is too small to hold a zpool. */ if (statbuf.st_size < SPA_MINDEVSIZE) { goto out; } } else if (S_ISCHR(statbuf.st_mode) || S_ISBLK(statbuf.st_mode)) { /* Check if this device is too small to hold a zpool. */ if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) != 0 || mediasize < SPA_MINDEVSIZE) { goto out; } } else { goto out; } if (zpool_read_label(fd, &config, &num_labels) != 0) goto out; if (num_labels == 0) { nvlist_free(config); goto out; } rn->rn_config = config; rn->rn_num_labels = num_labels; /* TODO: Reuse labelpaths logic from Linux? */ out: (void) close(fd); } static const char * const zpool_default_import_path[] = { "/dev" }; const char * const * zpool_default_search_paths(size_t *count) { *count = nitems(zpool_default_import_path); return (zpool_default_import_path); } int zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t **slice_cache) { const char *oid = "vfs.zfs.vol.recursive"; char *end, path[MAXPATHLEN]; rdsk_node_t *slice; struct gmesh mesh; struct gclass *mp; struct ggeom *gp; struct gprovider *pp; avl_index_t where; int error, value; size_t pathleft, size = sizeof (value); boolean_t skip_zvols = B_FALSE; end = stpcpy(path, "/dev/"); pathleft = &path[sizeof (path)] - end; error = geom_gettree(&mesh); if (error != 0) return (error); if (sysctlbyname(oid, &value, &size, NULL, 0) == 0 && value == 0) skip_zvols = B_TRUE; *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); LIST_FOREACH(mp, &mesh.lg_class, lg_class) { if (skip_zvols && strcmp(mp->lg_name, "ZFS::ZVOL") == 0) continue; LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { strlcpy(end, pp->lg_name, pathleft); slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zutil_strdup(hdl, path); slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = *slice_cache; slice->rn_hdl = hdl; slice->rn_labelpaths = B_FALSE; slice->rn_order = IMPORT_ORDER_DEFAULT; pthread_mutex_lock(lock); if (avl_find(*slice_cache, slice, &where)) { free(slice->rn_name); free(slice); } else { avl_insert(*slice_cache, slice, where); } pthread_mutex_unlock(lock); } } } geom_deletetree(&mesh); return (0); } int zfs_dev_flush(int fd __unused) { return (0); } + +void +update_vdevs_config_dev_sysfs_path(nvlist_t *config) +{ +} diff --git a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c index 5defb526f210..ab692401d88e 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c +++ b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c @@ -1,851 +1,892 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. * Copyright (c) 2016, Intel Corporation. */ /* * Pool import support functions. * * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since * these commands are expected to run in the global zone, we can assume * that the devices are all readable when called. * * To import a pool, we rely on reading the configuration information from the * ZFS label of each device. If we successfully read the label, then we * organize the configuration information in the following hierarchy: * * pool guid -> toplevel vdev guid -> label txg * * Duplicate entries matching this same tuple will be discarded. Once we have * examined every device, we pick the best label txg config for each toplevel * vdev. We then arrange these toplevel vdevs into a complete pool config, and * update any paths that have changed. Finally, we attempt to import the pool * using our derived config, and record the results. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "zutil_import.h" #ifdef HAVE_LIBUDEV #include #include #endif #include #define DEV_BYID_PATH "/dev/disk/by-id/" /* * Skip devices with well known prefixes: * there can be side effects when opening devices which need to be avoided. * * hpet - High Precision Event Timer * watchdog[N] - Watchdog must be closed in a special way. */ static boolean_t should_skip_dev(const char *dev) { return ((strcmp(dev, "watchdog") == 0) || (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) || (strcmp(dev, "hpet") == 0)); } int zfs_dev_flush(int fd) { return (ioctl(fd, BLKFLSBUF)); } void zpool_open_func(void *arg) { rdsk_node_t *rn = arg; libpc_handle_t *hdl = rn->rn_hdl; struct stat64 statbuf; nvlist_t *config; uint64_t vdev_guid = 0; int error; int num_labels = 0; int fd; if (should_skip_dev(zfs_basename(rn->rn_name))) return; /* * Ignore failed stats. We only want regular files and block devices. * Ignore files that are too small to hold a zpool. */ if (stat64(rn->rn_name, &statbuf) != 0 || (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) || (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE)) return; /* * Preferentially open using O_DIRECT to bypass the block device * cache which may be stale for multipath devices. An EINVAL errno * indicates O_DIRECT is unsupported so fallback to just O_RDONLY. */ fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC); if ((fd < 0) && (errno == EINVAL)) fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC); if ((fd < 0) && (errno == EACCES)) hdl->lpc_open_access_error = B_TRUE; if (fd < 0) return; error = zpool_read_label(fd, &config, &num_labels); if (error != 0) { (void) close(fd); return; } if (num_labels == 0) { (void) close(fd); nvlist_free(config); return; } /* * Check that the vdev is for the expected guid. Additional entries * are speculatively added based on the paths stored in the labels. * Entries with valid paths but incorrect guids must be removed. */ error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { (void) close(fd); nvlist_free(config); return; } (void) close(fd); rn->rn_config = config; rn->rn_num_labels = num_labels; /* * Add additional entries for paths described by this label. */ if (rn->rn_labelpaths) { char *path = NULL; char *devid = NULL; char *env = NULL; rdsk_node_t *slice; avl_index_t where; int timeout; int error; if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) return; env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || timeout < 0) { timeout = DISK_LABEL_WAIT; } /* * Allow devlinks to stabilize so all paths are available. */ zpool_label_disk_wait(rn->rn_name, timeout); if (path != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zutil_strdup(hdl, path); slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; slice->rn_order = IMPORT_ORDER_PREFERRED_1; slice->rn_labelpaths = B_FALSE; pthread_mutex_lock(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { pthread_mutex_unlock(rn->rn_lock); free(slice->rn_name); free(slice); } else { avl_insert(rn->rn_avl, slice, where); pthread_mutex_unlock(rn->rn_lock); zpool_open_func(slice); } } if (devid != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); error = asprintf(&slice->rn_name, "%s%s", DEV_BYID_PATH, devid); if (error == -1) { free(slice); return; } slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; slice->rn_order = IMPORT_ORDER_PREFERRED_2; slice->rn_labelpaths = B_FALSE; pthread_mutex_lock(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { pthread_mutex_unlock(rn->rn_lock); free(slice->rn_name); free(slice); } else { avl_insert(rn->rn_avl, slice, where); pthread_mutex_unlock(rn->rn_lock); zpool_open_func(slice); } } } } static const char * const zpool_default_import_path[] = { "/dev/disk/by-vdev", /* Custom rules, use first if they exist */ "/dev/mapper", /* Use multipath devices before components */ "/dev/disk/by-partlabel", /* Single unique entry set by user */ "/dev/disk/by-partuuid", /* Generated partition uuid */ "/dev/disk/by-label", /* Custom persistent labels */ "/dev/disk/by-uuid", /* Single unique entry and persistent */ "/dev/disk/by-id", /* May be multiple entries and persistent */ "/dev/disk/by-path", /* Encodes physical location and persistent */ "/dev" /* UNSAFE device names will change */ }; const char * const * zpool_default_search_paths(size_t *count) { *count = ARRAY_SIZE(zpool_default_import_path); return (zpool_default_import_path); } /* * Given a full path to a device determine if that device appears in the * import search path. If it does return the first match and store the * index in the passed 'order' variable, otherwise return an error. */ static int zfs_path_order(char *name, int *order) { int i, error = ENOENT; char *dir, *env, *envdup, *tmp = NULL; env = getenv("ZPOOL_IMPORT_PATH"); if (env) { envdup = strdup(env); for (dir = strtok_r(envdup, ":", &tmp), i = 0; dir != NULL; dir = strtok_r(NULL, ":", &tmp), i++) { if (strncmp(name, dir, strlen(dir)) == 0) { *order = i; error = 0; break; } } free(envdup); } else { for (i = 0; i < ARRAY_SIZE(zpool_default_import_path); i++) { if (strncmp(name, zpool_default_import_path[i], strlen(zpool_default_import_path[i])) == 0) { *order = i; error = 0; break; } } } return (error); } /* * Use libblkid to quickly enumerate all known zfs devices. */ int zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t **slice_cache) { rdsk_node_t *slice; blkid_cache cache; blkid_dev_iterate iter; blkid_dev dev; avl_index_t where; int error; *slice_cache = NULL; error = blkid_get_cache(&cache, NULL); if (error != 0) return (error); error = blkid_probe_all_new(cache); if (error != 0) { blkid_put_cache(cache); return (error); } iter = blkid_dev_iterate_begin(cache); if (iter == NULL) { blkid_put_cache(cache); return (EINVAL); } error = blkid_dev_set_search(iter, "TYPE", "zfs_member"); if (error != 0) { blkid_dev_iterate_end(iter); blkid_put_cache(cache); return (error); } *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); while (blkid_dev_next(iter, &dev) == 0) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev)); slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = *slice_cache; slice->rn_hdl = hdl; slice->rn_labelpaths = B_TRUE; error = zfs_path_order(slice->rn_name, &slice->rn_order); if (error == 0) slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; else slice->rn_order = IMPORT_ORDER_DEFAULT; pthread_mutex_lock(lock); if (avl_find(*slice_cache, slice, &where)) { free(slice->rn_name); free(slice); } else { avl_insert(*slice_cache, slice, where); } pthread_mutex_unlock(lock); } blkid_dev_iterate_end(iter); blkid_put_cache(cache); return (0); } /* * Linux persistent device strings for vdev labels * * based on libudev for consistency with libudev disk add/remove events */ typedef struct vdev_dev_strs { char vds_devid[128]; char vds_devphys[128]; } vdev_dev_strs_t; #ifdef HAVE_LIBUDEV /* * Obtain the persistent device id string (describes what) * * used by ZED vdev matching for auto-{online,expand,replace} */ int zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) { struct udev_list_entry *entry; const char *bus; char devbyid[MAXPATHLEN]; /* The bus based by-id path is preferred */ bus = udev_device_get_property_value(dev, "ID_BUS"); if (bus == NULL) { const char *dm_uuid; /* * For multipath nodes use the persistent uuid based identifier * * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f */ dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); if (dm_uuid != NULL) { (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); return (0); } /* * For volumes use the persistent /dev/zvol/dataset identifier */ entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { const char *name; name = udev_list_entry_get_name(entry); if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { (void) strlcpy(bufptr, name, buflen); return (0); } entry = udev_list_entry_get_next(entry); } /* * NVME 'by-id' symlinks are similar to bus case */ struct udev_device *parent; parent = udev_device_get_parent_with_subsystem_devtype(dev, "nvme", NULL); if (parent != NULL) bus = "nvme"; /* continue with bus symlink search */ else return (ENODATA); } /* * locate the bus specific by-id link */ (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { const char *name; name = udev_list_entry_get_name(entry); if (strncmp(name, devbyid, strlen(devbyid)) == 0) { name += strlen(DEV_BYID_PATH); (void) strlcpy(bufptr, name, buflen); return (0); } entry = udev_list_entry_get_next(entry); } return (ENODATA); } /* * Obtain the persistent physical location string (describes where) * * used by ZED vdev matching for auto-{online,expand,replace} */ int zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) { const char *physpath = NULL; struct udev_list_entry *entry; /* * Normal disks use ID_PATH for their physical path. */ physpath = udev_device_get_property_value(dev, "ID_PATH"); if (physpath != NULL && strlen(physpath) > 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } /* * Device mapper devices are virtual and don't have a physical * path. For them we use ID_VDEV instead, which is setup via the * /etc/vdev_id.conf file. ID_VDEV provides a persistent path * to a virtual device. If you don't have vdev_id.conf setup, * you cannot use multipath autoreplace with device mapper. */ physpath = udev_device_get_property_value(dev, "ID_VDEV"); if (physpath != NULL && strlen(physpath) > 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } /* * For ZFS volumes use the persistent /dev/zvol/dataset identifier */ entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { physpath = udev_list_entry_get_name(entry); if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } entry = udev_list_entry_get_next(entry); } /* * For all other devices fallback to using the by-uuid name. */ entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { physpath = udev_list_entry_get_name(entry); if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } entry = udev_list_entry_get_next(entry); } return (ENODATA); } /* * A disk is considered a multipath whole disk when: * DEVNAME key value has "dm-" * DM_NAME key value has "mpath" prefix * DM_UUID key exists * ID_PART_TABLE_TYPE key does not exist or is not gpt */ static boolean_t udev_mpath_whole_disk(struct udev_device *dev) { const char *devname, *type, *uuid; devname = udev_device_get_property_value(dev, "DEVNAME"); type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); uuid = udev_device_get_property_value(dev, "DM_UUID"); if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && ((type == NULL) || (strcmp(type, "gpt") != 0)) && (uuid != NULL)) { return (B_TRUE); } return (B_FALSE); } static int udev_device_is_ready(struct udev_device *dev) { #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED return (udev_device_get_is_initialized(dev)); #else /* wait for DEVLINKS property to be initialized */ return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); #endif } #else /* ARGSUSED */ int zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) { return (ENODATA); } /* ARGSUSED */ int zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) { return (ENODATA); } #endif /* HAVE_LIBUDEV */ /* * Wait up to timeout_ms for udev to set up the device node. The device is * considered ready when libudev determines it has been initialized, all of * the device links have been verified to exist, and it has been allowed to * settle. At this point the device the device can be accessed reliably. * Depending on the complexity of the udev rules this process could take * several seconds. */ int zpool_label_disk_wait(const char *path, int timeout_ms) { #ifdef HAVE_LIBUDEV struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname = NULL; int ret = ENODEV; int settle_ms = 50; long sleep_ms = 10; hrtime_t start, settle; if ((udev = udev_new()) == NULL) return (ENXIO); start = gethrtime(); settle = 0; do { if (sysname == NULL) { if (realpath(path, nodepath) != NULL) { sysname = strrchr(nodepath, '/') + 1; } else { (void) usleep(sleep_ms * MILLISEC); continue; } } dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname); if ((dev != NULL) && udev_device_is_ready(dev)) { struct udev_list_entry *links, *link = NULL; ret = 0; links = udev_device_get_devlinks_list_entry(dev); udev_list_entry_foreach(link, links) { struct stat64 statbuf; const char *name; name = udev_list_entry_get_name(link); errno = 0; if (stat64(name, &statbuf) == 0 && errno == 0) continue; settle = 0; ret = ENODEV; break; } if (ret == 0) { if (settle == 0) { settle = gethrtime(); } else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) { udev_device_unref(dev); break; } } } udev_device_unref(dev); (void) usleep(sleep_ms * MILLISEC); } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); udev_unref(udev); return (ret); #else int settle_ms = 50; long sleep_ms = 10; hrtime_t start, settle; struct stat64 statbuf; start = gethrtime(); settle = 0; do { errno = 0; if ((stat64(path, &statbuf) == 0) && (errno == 0)) { if (settle == 0) settle = gethrtime(); else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) return (0); } else if (errno != ENOENT) { return (errno); } usleep(sleep_ms * MILLISEC); } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); return (ENODEV); #endif /* HAVE_LIBUDEV */ } /* * Encode the persistent devices strings * used for the vdev disk label */ static int encode_device_strings(const char *path, vdev_dev_strs_t *ds, boolean_t wholedisk) { #ifdef HAVE_LIBUDEV struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname; int ret = ENODEV; hrtime_t start; if ((udev = udev_new()) == NULL) return (ENXIO); /* resolve path to a runtime device node instance */ if (realpath(path, nodepath) == NULL) goto no_dev; sysname = strrchr(nodepath, '/') + 1; /* * Wait up to 3 seconds for udev to set up the device node context */ start = gethrtime(); do { dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname); if (dev == NULL) goto no_dev; if (udev_device_is_ready(dev)) break; /* udev ready */ udev_device_unref(dev); dev = NULL; if (NSEC2MSEC(gethrtime() - start) < 10) (void) sched_yield(); /* yield/busy wait up to 10ms */ else (void) usleep(10 * MILLISEC); } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); if (dev == NULL) goto no_dev; /* * Only whole disks require extra device strings */ if (!wholedisk && !udev_mpath_whole_disk(dev)) goto no_dev; ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); if (ret != 0) goto no_dev_ref; /* physical location string (optional) */ if (zfs_device_get_physical(dev, ds->vds_devphys, sizeof (ds->vds_devphys)) != 0) { ds->vds_devphys[0] = '\0'; /* empty string --> not available */ } no_dev_ref: udev_device_unref(dev); no_dev: udev_unref(udev); return (ret); #else return (ENOENT); #endif } +/* + * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it + * in the nvlist * (if applicable). Like: + * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + */ +static void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, char *path) +{ + char *upath, *spath; + + /* Add enclosure sysfs path (if disk is in an enclosure). */ + upath = zfs_get_underlying_path(path); + spath = zfs_get_enclosure_sysfs_path(upath); + + if (spath) { + nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); + } else { + nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + } + + free(upath); + free(spath); +} + +/* + * This will get called for each leaf vdev. + */ +static int +sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) +{ + char *path = NULL; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return (1); + + /* Rescan our enclosure sysfs path for this vdev */ + update_vdev_config_dev_sysfs_path(nv, path); + return (0); +} + +/* + * Given an nvlist for our pool (with vdev tree), iterate over all the + * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH. + */ +void +update_vdevs_config_dev_sysfs_path(nvlist_t *config) +{ + nvlist_t *nvroot = NULL; + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL); +} + /* * Update a leaf vdev's persistent device strings * * - only applies for a dedicated leaf vdev (aka whole disk) * - updated during pool create|add|attach|import * - used for matching device matching during auto-{online,expand,replace} * - stored in a leaf disk config label (i.e. alongside 'path' NVP) * - these strings are currently not used in kernel (i.e. for vdev_disk_open) * * single device node example: * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' * * multipath device node example: * devid: 'dm-uuid-mpath-35000c5006304de3f' * * We also store the enclosure sysfs path for turning on enclosure LEDs * (if applicable): * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' */ void update_vdev_config_dev_strs(nvlist_t *nv) { vdev_dev_strs_t vds; char *env, *type, *path; uint64_t wholedisk = 0; - char *upath, *spath; /* * For the benefit of legacy ZFS implementations, allow * for opting out of devid strings in the vdev label. * * example use: * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer * * explanation: * Older OpenZFS implementations had issues when attempting to * display pool config VDEV names if a "devid" NVP value is * present in the pool's config. * * For example, a pool that originated on illumos platform would * have a devid value in the config and "zpool status" would fail * when listing the config. * * A pool can be stripped of any "devid" values on import or * prevented from adding them on zpool create|add by setting * ZFS_VDEV_DEVID_OPT_OUT. */ env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); if (env && (strtoul(env, NULL, 0) > 0 || !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); return; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || strcmp(type, VDEV_TYPE_DISK) != 0) { return; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); /* * Update device string values in the config nvlist. */ if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); if (vds.vds_devphys[0] != '\0') { (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vds.vds_devphys); } - - /* Add enclosure sysfs path (if disk is in an enclosure). */ - upath = zfs_get_underlying_path(path); - spath = zfs_get_enclosure_sysfs_path(upath); - if (spath) - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, - spath); - else - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); - - free(upath); - free(spath); + update_vdev_config_dev_sysfs_path(nv, path); } else { /* Clear out any stale entries. */ (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } } diff --git a/sys/contrib/openzfs/lib/libzutil/zutil_import.c b/sys/contrib/openzfs/lib/libzutil/zutil_import.c index 04b9f26abb92..9eb55aaf77ce 100644 --- a/sys/contrib/openzfs/lib/libzutil/zutil_import.c +++ b/sys/contrib/openzfs/lib/libzutil/zutil_import.c @@ -1,1861 +1,1929 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. * Copyright (c) 2016, Intel Corporation. * Copyright (c) 2021, Colm Buckley */ /* * Pool import support functions. * * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since * these commands are expected to run in the global zone, we can assume * that the devices are all readable when called. * * To import a pool, we rely on reading the configuration information from the * ZFS label of each device. If we successfully read the label, then we * organize the configuration information in the following hierarchy: * * pool guid -> toplevel vdev guid -> label txg * * Duplicate entries matching this same tuple will be discarded. Once we have * examined every device, we pick the best label txg config for each toplevel * vdev. We then arrange these toplevel vdevs into a complete pool config, and * update any paths that have changed. Finally, we attempt to import the pool * using our derived config, and record the results. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zutil_import.h" static __attribute__((format(printf, 2, 3))) void zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap); hdl->lpc_desc_active = B_TRUE; va_end(ap); } static void zutil_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap) { char action[1024]; (void) vsnprintf(action, sizeof (action), fmt, ap); if (hdl->lpc_desc_active) hdl->lpc_desc_active = B_FALSE; else hdl->lpc_desc[0] = '\0'; if (hdl->lpc_printerr) { if (hdl->lpc_desc[0] != '\0') error = hdl->lpc_desc; (void) fprintf(stderr, "%s: %s\n", action, error); } } static __attribute__((format(printf, 3, 4))) int zutil_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); zutil_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zutil_error(libpc_handle_t *hdl, const char *error, const char *msg) { return (zutil_error_fmt(hdl, error, "%s", msg)); } static int zutil_no_memory(libpc_handle_t *hdl) { zutil_error(hdl, EZFS_NOMEM, "internal error"); exit(1); } void * zutil_alloc(libpc_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) zutil_no_memory(hdl); return (data); } char * zutil_strdup(libpc_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) zutil_no_memory(hdl); return (ret); } static char * zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n) { char *ret; if ((ret = strndup(str, n)) == NULL) (void) zutil_no_memory(hdl); return (ret); } /* * Intermediate structures used to gather configuration information. */ typedef struct config_entry { uint64_t ce_txg; nvlist_t *ce_config; struct config_entry *ce_next; } config_entry_t; typedef struct vdev_entry { uint64_t ve_guid; config_entry_t *ve_configs; struct vdev_entry *ve_next; } vdev_entry_t; typedef struct pool_entry { uint64_t pe_guid; vdev_entry_t *pe_vdevs; struct pool_entry *pe_next; } pool_entry_t; typedef struct name_entry { char *ne_name; uint64_t ne_guid; uint64_t ne_order; uint64_t ne_num_labels; struct name_entry *ne_next; } name_entry_t; typedef struct pool_list { pool_entry_t *pools; name_entry_t *names; } pool_list_t; /* * Go through and fix up any path and/or devid information for the given vdev * configuration. */ static int fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names) { nvlist_t **child; uint_t c, children; uint64_t guid; name_entry_t *ne, *best; char *path; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (fix_paths(hdl, child[c], names) != 0) return (-1); return (0); } /* * This is a leaf (file or disk) vdev. In either case, go through * the name list and see if we find a matching guid. If so, replace * the path and see if we can calculate a new devid. * * There may be multiple names associated with a particular guid, in * which case we have overlapping partitions or multiple paths to the * same disk. In this case we prefer to use the path name which * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we * use the lowest order device which corresponds to the first match * while traversing the ZPOOL_IMPORT_PATH search path. */ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) path = NULL; best = NULL; for (ne = names; ne != NULL; ne = ne->ne_next) { if (ne->ne_guid == guid) { if (path == NULL) { best = ne; break; } if ((strlen(path) == strlen(ne->ne_name)) && strncmp(path, ne->ne_name, strlen(path)) == 0) { best = ne; break; } if (best == NULL) { best = ne; continue; } /* Prefer paths with move vdev labels. */ if (ne->ne_num_labels > best->ne_num_labels) { best = ne; continue; } /* Prefer paths earlier in the search order. */ if (ne->ne_num_labels == best->ne_num_labels && ne->ne_order < best->ne_order) { best = ne; continue; } } } if (best == NULL) return (0); if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) return (-1); update_vdev_config_dev_strs(nv); return (0); } /* * Add the given configuration to the list of known devices. */ static int add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, int order, int num_labels, nvlist_t *config) { uint64_t pool_guid, vdev_guid, top_guid, txg, state; pool_entry_t *pe; vdev_entry_t *ve; config_entry_t *ce; name_entry_t *ne; /* * If this is a hot spare not currently in use or level 2 cache * device, add it to the list of names to translate, but don't do * anything else. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) == 0 && (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { free(ne); return (-1); } ne->ne_guid = vdev_guid; ne->ne_order = order; ne->ne_num_labels = num_labels; ne->ne_next = pl->names; pl->names = ne; return (0); } /* * If we have a valid config but cannot read any of these fields, then * it means we have a half-initialized label. In vdev_label_init() * we write a label with txg == 0 so that we can identify the device * in case the user refers to the same disk later on. If we fail to * create the pool, we'll be left with a label in this state * which should not be considered part of a valid pool. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) != 0 || nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0) { return (0); } /* * First, see if we know about this pool. If not, then add it to the * list of known pools. */ for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { if (pe->pe_guid == pool_guid) break; } if (pe == NULL) { if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) { return (-1); } pe->pe_guid = pool_guid; pe->pe_next = pl->pools; pl->pools = pe; } /* * Second, see if we know about this toplevel vdev. Add it if its * missing. */ for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { if (ve->ve_guid == top_guid) break; } if (ve == NULL) { if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { return (-1); } ve->ve_guid = top_guid; ve->ve_next = pe->pe_vdevs; pe->pe_vdevs = ve; } /* * Third, see if we have a config with a matching transaction group. If * so, then we do nothing. Otherwise, add it to the list of known * configs. */ for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { if (ce->ce_txg == txg) break; } if (ce == NULL) { if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) { return (-1); } ce->ce_txg = txg; ce->ce_config = fnvlist_dup(config); ce->ce_next = ve->ve_configs; ve->ve_configs = ce; } /* * At this point we've successfully added our config to the list of * known configs. The last thing to do is add the vdev guid -> path * mappings so that we can fix up the configuration as necessary before * doing the import. */ if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { free(ne); return (-1); } ne->ne_guid = vdev_guid; ne->ne_order = order; ne->ne_num_labels = num_labels; ne->ne_next = pl->names; pl->names = ne; return (0); } static int zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid, boolean_t *isactive) { ASSERT(hdl->lpc_ops->pco_pool_active != NULL); int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name, guid, isactive); return (error); } static nvlist_t * zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig) { ASSERT(hdl->lpc_ops->pco_refresh_config != NULL); return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle, tryconfig)); } /* * Determine if the vdev id is a hole in the namespace. */ static boolean_t vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) { int c; for (c = 0; c < holes; c++) { /* Top-level is a hole */ if (hole_array[c] == id) return (B_TRUE); } return (B_FALSE); } /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. Once that's done, * we assemble the toplevel vdevs into a full config for the pool. We make a * pass to fix up any incorrect paths, and then add it to the main list to * return to the user. */ static nvlist_t * get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, nvlist_t *policy) { pool_entry_t *pe; vdev_entry_t *ve; config_entry_t *ce; nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; nvlist_t **spares, **l2cache; uint_t i, nspares, nl2cache; boolean_t config_seen; uint64_t best_txg; char *name, *hostname = NULL; uint64_t guid; uint_t children = 0; nvlist_t **child = NULL; uint_t holes; uint64_t *hole_array, max_id; uint_t c; boolean_t isactive; uint64_t hostid; nvlist_t *nvl; boolean_t valid_top_config = B_FALSE; if (nvlist_alloc(&ret, 0, 0) != 0) goto nomem; for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { uint64_t id, max_txg = 0; if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) goto nomem; config_seen = B_FALSE; /* * Iterate over all toplevel vdevs. Grab the pool configuration * from the first one we find, and then go through the rest and * add them as necessary to the 'vdevs' member of the config. */ for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { /* * Determine the best configuration for this vdev by * selecting the config with the latest transaction * group. */ best_txg = 0; for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { if (ce->ce_txg > best_txg) { tmp = ce->ce_config; best_txg = ce->ce_txg; } } /* * We rely on the fact that the max txg for the * pool will contain the most up-to-date information * about the valid top-levels in the vdev namespace. */ if (best_txg > max_txg) { (void) nvlist_remove(config, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64); (void) nvlist_remove(config, ZPOOL_CONFIG_HOLE_ARRAY, DATA_TYPE_UINT64_ARRAY); max_txg = best_txg; hole_array = NULL; holes = 0; max_id = 0; valid_top_config = B_FALSE; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { verify(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, max_id) == 0); valid_top_config = B_TRUE; } if (nvlist_lookup_uint64_array(tmp, ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, &holes) == 0) { verify(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, hole_array, holes) == 0); } } if (!config_seen) { /* * Copy the relevant pieces of data to the pool * configuration: * * version * pool guid * name * comment (if available) * compatibility features (if available) * pool state * hostid (if available) * hostname (if available) */ uint64_t state, version; char *comment = NULL; char *compatibility = NULL; version = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_VERSION); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, version); guid = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_POOL_GUID); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, guid); name = fnvlist_lookup_string(tmp, ZPOOL_CONFIG_POOL_NAME); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, name); if (nvlist_lookup_string(tmp, ZPOOL_CONFIG_COMMENT, &comment) == 0) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, comment); if (nvlist_lookup_string(tmp, ZPOOL_CONFIG_COMPATIBILITY, &compatibility) == 0) fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY, compatibility); state = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_POOL_STATE); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); hostid = 0; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); hostname = fnvlist_lookup_string(tmp, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, hostname); } config_seen = B_TRUE; } /* * Add this top-level vdev to the child array. */ verify(nvlist_lookup_nvlist(tmp, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, &id) == 0); if (id >= children) { nvlist_t **newchild; newchild = zutil_alloc(hdl, (id + 1) * sizeof (nvlist_t *)); if (newchild == NULL) goto nomem; for (c = 0; c < children; c++) newchild[c] = child[c]; free(child); child = newchild; children = id + 1; } if (nvlist_dup(nvtop, &child[id], 0) != 0) goto nomem; } /* * If we have information about all the top-levels then * clean up the nvlist which we've constructed. This * means removing any extraneous devices that are * beyond the valid range or adding devices to the end * of our array which appear to be missing. */ if (valid_top_config) { if (max_id < children) { for (c = max_id; c < children; c++) nvlist_free(child[c]); children = max_id; } else if (max_id > children) { nvlist_t **newchild; newchild = zutil_alloc(hdl, (max_id) * sizeof (nvlist_t *)); if (newchild == NULL) goto nomem; for (c = 0; c < children; c++) newchild[c] = child[c]; free(child); child = newchild; children = max_id; } } verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); /* * The vdev namespace may contain holes as a result of * device removal. We must add them back into the vdev * tree before we process any missing devices. */ if (holes > 0) { ASSERT(valid_top_config); for (c = 0; c < children; c++) { nvlist_t *holey; if (child[c] != NULL || !vdev_is_hole(hole_array, holes, c)) continue; if (nvlist_alloc(&holey, NV_UNIQUE_NAME, 0) != 0) goto nomem; /* * Holes in the namespace are treated as * "hole" top-level vdevs and have a * special flag set on them. */ if (nvlist_add_string(holey, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0 || nvlist_add_uint64(holey, ZPOOL_CONFIG_ID, c) != 0 || nvlist_add_uint64(holey, ZPOOL_CONFIG_GUID, 0ULL) != 0) { nvlist_free(holey); goto nomem; } child[c] = holey; } } /* * Look for any missing top-level vdevs. If this is the case, * create a faked up 'missing' vdev as a placeholder. We cannot * simply compress the child array, because the kernel performs * certain checks to make sure the vdev IDs match their location * in the configuration. */ for (c = 0; c < children; c++) { if (child[c] == NULL) { nvlist_t *missing; if (nvlist_alloc(&missing, NV_UNIQUE_NAME, 0) != 0) goto nomem; if (nvlist_add_string(missing, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) != 0 || nvlist_add_uint64(missing, ZPOOL_CONFIG_ID, c) != 0 || nvlist_add_uint64(missing, ZPOOL_CONFIG_GUID, 0ULL) != 0) { nvlist_free(missing); goto nomem; } child[c] = missing; } } /* * Put all of this pool's top-level vdevs into a root vdev. */ if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) goto nomem; if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, child, children) != 0) { nvlist_free(nvroot); goto nomem; } for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); children = 0; child = NULL; /* * Go through and fix up any paths and/or devids based on our * known list of vdev GUID -> path mappings. */ if (fix_paths(hdl, nvroot, pl->names) != 0) { nvlist_free(nvroot); goto nomem; } /* * Add the root vdev to this pool's configuration. */ if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) != 0) { nvlist_free(nvroot); goto nomem; } nvlist_free(nvroot); /* * zdb uses this path to report on active pools that were * imported or created using -R. */ if (active_ok) goto add_pool; /* * Determine if this pool is currently active, in which case we * can't actually import it. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (zutil_pool_active(hdl, name, guid, &isactive) != 0) goto error; if (isactive) { nvlist_free(config); config = NULL; continue; } if (policy != NULL) { if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, policy) != 0) goto nomem; } if ((nvl = zutil_refresh_config(hdl, config)) == NULL) { nvlist_free(config); config = NULL; continue; } nvlist_free(config); config = nvl; /* * Go through and update the paths for spares, now that we have * them. */ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { for (i = 0; i < nspares; i++) { if (fix_paths(hdl, spares[i], pl->names) != 0) goto nomem; } } /* * Update the paths for l2cache devices. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { for (i = 0; i < nl2cache; i++) { if (fix_paths(hdl, l2cache[i], pl->names) != 0) goto nomem; } } /* * Restore the original information read from the actual label. */ (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, DATA_TYPE_UINT64); (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, DATA_TYPE_STRING); if (hostid != 0) { verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid) == 0); verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, hostname) == 0); } add_pool: /* * Add this pool to the list of configs. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (nvlist_add_nvlist(ret, name, config) != 0) goto nomem; nvlist_free(config); config = NULL; } return (ret); nomem: (void) zutil_no_memory(hdl); error: nvlist_free(config); nvlist_free(ret); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); return (NULL); } /* * Return the offset of the given label. */ static uint64_t label_offset(uint64_t size, int l) { ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); } /* * The same description applies as to zpool_read_label below, * except here we do it without aio, presumably because an aio call * errored out in a way we think not using it could circumvent. */ static int zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels) { struct stat64 statbuf; int l, count = 0; vdev_phys_t *label; nvlist_t *expected_config = NULL; uint64_t expected_guid = 0, size; int error; *config = NULL; if (fstat64_blk(fd, &statbuf) == -1) return (0); size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label)); if (error) return (-1); for (l = 0; l < VDEV_LABELS; l++) { uint64_t state, guid, txg; off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE; if (pread64(fd, label, sizeof (vdev_phys_t), offset) != sizeof (vdev_phys_t)) continue; if (nvlist_unpack(label->vp_nvlist, sizeof (label->vp_nvlist), config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, &guid) != 0 || guid == 0) { nvlist_free(*config); continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(*config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); continue; } if (expected_guid) { if (expected_guid == guid) count++; nvlist_free(*config); } else { expected_config = *config; expected_guid = guid; count++; } } if (num_labels != NULL) *num_labels = count; free(label); *config = expected_config; return (0); } /* * Given a file descriptor, read the label information and return an nvlist * describing the configuration, if there is one. The number of valid * labels found will be returned in num_labels when non-NULL. */ int zpool_read_label(int fd, nvlist_t **config, int *num_labels) { struct stat64 statbuf; struct aiocb aiocbs[VDEV_LABELS]; struct aiocb *aiocbps[VDEV_LABELS]; vdev_phys_t *labels; nvlist_t *expected_config = NULL; uint64_t expected_guid = 0, size; int error, l, count = 0; *config = NULL; if (fstat64_blk(fd, &statbuf) == -1) return (0); size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); error = posix_memalign((void **)&labels, PAGESIZE, VDEV_LABELS * sizeof (*labels)); if (error) return (-1); memset(aiocbs, 0, sizeof (aiocbs)); for (l = 0; l < VDEV_LABELS; l++) { off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE; aiocbs[l].aio_fildes = fd; aiocbs[l].aio_offset = offset; aiocbs[l].aio_buf = &labels[l]; aiocbs[l].aio_nbytes = sizeof (vdev_phys_t); aiocbs[l].aio_lio_opcode = LIO_READ; aiocbps[l] = &aiocbs[l]; } if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) { int saved_errno = errno; boolean_t do_slow = B_FALSE; error = -1; if (errno == EAGAIN || errno == EINTR || errno == EIO) { /* * A portion of the requests may have been submitted. * Clean them up. */ for (l = 0; l < VDEV_LABELS; l++) { errno = 0; switch (aio_error(&aiocbs[l])) { case EINVAL: break; case EINPROGRESS: // This shouldn't be possible to // encounter, die if we do. ASSERT(B_FALSE); fallthrough; case EOPNOTSUPP: case ENOSYS: do_slow = B_TRUE; fallthrough; case 0: default: (void) aio_return(&aiocbs[l]); } } } if (do_slow) { /* * At least some IO involved access unsafe-for-AIO * files. Let's try again, without AIO this time. */ error = zpool_read_label_slow(fd, config, num_labels); saved_errno = errno; } free(labels); errno = saved_errno; return (error); } for (l = 0; l < VDEV_LABELS; l++) { uint64_t state, guid, txg; if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t)) continue; if (nvlist_unpack(labels[l].vp_nvlist, sizeof (labels[l].vp_nvlist), config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, &guid) != 0 || guid == 0) { nvlist_free(*config); continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(*config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); continue; } if (expected_guid) { if (expected_guid == guid) count++; nvlist_free(*config); } else { expected_config = *config; expected_guid = guid; count++; } } if (num_labels != NULL) *num_labels = count; free(labels); *config = expected_config; return (0); } /* * Sorted by full path and then vdev guid to allow for multiple entries with * the same full path name. This is required because it's possible to * have multiple block devices with labels that refer to the same * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both * entries need to be added to the cache. Scenarios where this can occur * include overwritten pool labels, devices which are visible from multiple * hosts and multipath devices. */ int slice_cache_compare(const void *arg1, const void *arg2) { const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid; uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; rv = TREE_ISIGN(strcmp(nm1, nm2)); if (rv) return (rv); return (TREE_CMP(guid1, guid2)); } static int label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid, uint64_t vdev_guid, char **path, char **devid) { nvlist_t **child; uint_t c, children; uint64_t guid; char *val; int error; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) { error = label_paths_impl(hdl, child[c], pool_guid, vdev_guid, path, devid); if (error) return (error); } return (0); } if (nvroot == NULL) return (0); error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid); if ((error != 0) || (guid != vdev_guid)) return (0); error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val); if (error == 0) *path = val; error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val); if (error == 0) *devid = val; return (0); } /* * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID * and store these strings as config_path and devid_path respectively. * The returned pointers are only valid as long as label remains valid. */ int label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid) { nvlist_t *nvroot; uint64_t pool_guid; uint64_t vdev_guid; *path = NULL; *devid = NULL; if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid)) return (ENOENT); return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path, devid)); } static void zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *path, const char *name, int order) { avl_index_t where; rdsk_node_t *slice; slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) { free(slice); return; } slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = cache; slice->rn_hdl = hdl; slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET; slice->rn_labelpaths = B_FALSE; pthread_mutex_lock(lock); if (avl_find(cache, slice, &where)) { free(slice->rn_name); free(slice); } else { avl_insert(cache, slice, where); } pthread_mutex_unlock(lock); } static int zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *dir, int order) { int error; char path[MAXPATHLEN]; struct dirent64 *dp; DIR *dirp; if (realpath(dir, path) == NULL) { error = errno; if (error == ENOENT) return (0); zutil_error_aux(hdl, "%s", strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir); return (error); } dirp = opendir(path); if (dirp == NULL) { error = errno; zutil_error_aux(hdl, "%s", strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); return (error); } while ((dp = readdir64(dirp)) != NULL) { const char *name = dp->d_name; if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) continue; switch (dp->d_type) { case DT_UNKNOWN: case DT_BLK: case DT_LNK: #ifdef __FreeBSD__ case DT_CHR: #endif case DT_REG: break; default: continue; } zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); } (void) closedir(dirp); return (0); } static int zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *dir, int order) { int error = 0; char path[MAXPATHLEN]; char *d = NULL; ssize_t dl; const char *dpath, *name; /* * Separate the directory and the basename. * We do this so that we can get the realpath of * the directory. We don't get the realpath on the * whole path because if it's a symlink, we want the * path of the symlink not where it points to. */ name = zfs_basename(dir); if ((dl = zfs_dirnamelen(dir)) == -1) dpath = "."; else dpath = d = zutil_strndup(hdl, dir, dl); if (realpath(dpath, path) == NULL) { error = errno; if (error == ENOENT) { error = 0; goto out; } zutil_error_aux(hdl, "%s", strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir); goto out; } zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); out: free(d); return (error); } /* * Scan a list of directories for zfs devices. */ static int zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t **slice_cache, const char * const *dir, size_t dirs) { avl_tree_t *cache; rdsk_node_t *slice; void *cookie; int i, error; *slice_cache = NULL; cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); for (i = 0; i < dirs; i++) { struct stat sbuf; if (stat(dir[i], &sbuf) != 0) { error = errno; if (error == ENOENT) continue; zutil_error_aux(hdl, "%s", strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]); goto error; } /* * If dir[i] is a directory, we walk through it and add all * the entries to the cache. If it's not a directory, we just * add it to the cache. */ if (S_ISDIR(sbuf.st_mode)) { if ((error = zpool_find_import_scan_dir(hdl, lock, cache, dir[i], i)) != 0) goto error; } else { if ((error = zpool_find_import_scan_path(hdl, lock, cache, dir[i], i)) != 0) goto error; } } *slice_cache = cache; return (0); error: cookie = NULL; while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { free(slice->rn_name); free(slice); } free(cache); return (error); } /* * Given a list of directories to search, find all pools stored on disk. This * includes partial pools which are not available to import. If no args are * given (argc is 0), then the default directory (/dev/dsk) is searched. * poolname or guid (but not both) are provided by the caller when trying * to import a specific pool. */ static nvlist_t * zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg, pthread_mutex_t *lock, avl_tree_t *cache) { nvlist_t *ret = NULL; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; rdsk_node_t *slice; void *cookie; tpool_t *t; verify(iarg->poolname == NULL || iarg->guid == 0); /* * Create a thread pool to parallelize the process of reading and * validating labels, a large number of threads can be used due to * minimal contention. */ t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); for (slice = avl_first(cache); slice; (slice = avl_walk(cache, slice, AVL_AFTER))) (void) tpool_dispatch(t, zpool_open_func, slice); tpool_wait(t); tpool_destroy(t); /* * Process the cache, filtering out any entries which are not * for the specified pool then adding matching label configs. */ cookie = NULL; while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { if (slice->rn_config != NULL) { nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; boolean_t aux = B_FALSE; int fd; /* * Check if it's a spare or l2cache device. If it is, * we need to skip the name and guid check since they * don't exist on aux device label. */ if (iarg->poolname != NULL || iarg->guid != 0) { uint64_t state; aux = nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) == 0 && (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE); } if (iarg->poolname != NULL && !aux) { char *pname; matched = nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && strcmp(iarg->poolname, pname) == 0; } else if (iarg->guid != 0 && !aux) { uint64_t this_guid; matched = nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 && iarg->guid == this_guid; } if (matched) { /* * Verify all remaining entries can be opened * exclusively. This will prune all underlying * multipath devices which otherwise could * result in the vdev appearing as UNAVAIL. * * Under zdb, this step isn't required and * would prevent a zdb -e of active pools with * no cachefile. */ fd = open(slice->rn_name, O_RDONLY | O_EXCL | O_CLOEXEC); if (fd >= 0 || iarg->can_be_active) { if (fd >= 0) close(fd); add_config(hdl, &pools, slice->rn_name, slice->rn_order, slice->rn_num_labels, config); } } nvlist_free(config); } free(slice->rn_name); free(slice); } avl_destroy(cache); free(cache); ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); for (pe = pools.pools; pe != NULL; pe = penext) { penext = pe->pe_next; for (ve = pe->pe_vdevs; ve != NULL; ve = venext) { venext = ve->ve_next; for (ce = ve->ve_configs; ce != NULL; ce = cenext) { cenext = ce->ce_next; nvlist_free(ce->ce_config); free(ce); } free(ve); } free(pe); } for (ne = pools.names; ne != NULL; ne = nenext) { nenext = ne->ne_next; free(ne->ne_name); free(ne); } return (ret); } /* * Given a config, discover the paths for the devices which * exist in the config. */ static int discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv, avl_tree_t *cache, pthread_mutex_t *lock) { char *path = NULL; ssize_t dl; uint_t children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (int c = 0; c < children; c++) { discover_cached_paths(hdl, child[c], cache, lock); } } /* * Once we have the path, we need to add the directory to * our directory cache. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { if ((dl = zfs_dirnamelen(path)) == -1) path = "."; else path[dl] = '\0'; return (zpool_find_import_scan_dir(hdl, lock, cache, path, 0)); } return (0); } /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying * to import a specific pool. */ static nvlist_t * zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) { char *buf; int fd; struct stat64 statbuf; nvlist_t *raw, *src, *dst; nvlist_t *pools; nvpair_t *elem; char *name; uint64_t this_guid; boolean_t active; verify(iarg->poolname == NULL || iarg->guid == 0); if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) { zutil_error_aux(hdl, "%s", strerror(errno)); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to open cache file")); return (NULL); } if (fstat64(fd, &statbuf) != 0) { zutil_error_aux(hdl, "%s", strerror(errno)); (void) close(fd); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to get size of cache file")); return (NULL); } if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) { (void) close(fd); return (NULL); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) close(fd); free(buf); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to read cache file contents")); return (NULL); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { free(buf); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "invalid or corrupt cache file contents")); return (NULL); } free(buf); /* * Go through and get the current state of the pools and refresh their * state. */ if (nvlist_alloc(&pools, 0, 0) != 0) { (void) zutil_no_memory(hdl); nvlist_free(raw); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { src = fnvpair_value_nvlist(elem); name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0) continue; this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); if (iarg->guid != 0 && iarg->guid != this_guid) continue; if (zutil_pool_active(hdl, name, this_guid, &active) != 0) { nvlist_free(raw); nvlist_free(pools); return (NULL); } if (active) continue; if (iarg->scan) { uint64_t saved_guid = iarg->guid; const char *saved_poolname = iarg->poolname; pthread_mutex_t lock; /* * Create the device cache that will hold the * devices we will scan based on the cachefile. * This will get destroyed and freed by * zpool_find_import_impl. */ avl_tree_t *cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); nvlist_t *nvroot = fnvlist_lookup_nvlist(src, ZPOOL_CONFIG_VDEV_TREE); /* * We only want to find the pool with this_guid. * We will reset these values back later. */ iarg->guid = this_guid; iarg->poolname = NULL; /* * We need to build up a cache of devices that exists * in the paths pointed to by the cachefile. This allows * us to preserve the device namespace that was * originally specified by the user but also lets us * scan devices in those directories in case they had * been renamed. */ pthread_mutex_init(&lock, NULL); discover_cached_paths(hdl, nvroot, cache, &lock); nvlist_t *nv = zpool_find_import_impl(hdl, iarg, &lock, cache); pthread_mutex_destroy(&lock); /* * zpool_find_import_impl will return back * a list of pools that it found based on the * device cache. There should only be one pool * since we're looking for a specific guid. * We will use that pool to build up the final * pool nvlist which is returned back to the * caller. */ nvpair_t *pair = nvlist_next_nvpair(nv, NULL); fnvlist_add_nvlist(pools, nvpair_name(pair), fnvpair_value_nvlist(pair)); VERIFY3P(nvlist_next_nvpair(nv, pair), ==, NULL); iarg->guid = saved_guid; iarg->poolname = saved_poolname; continue; } if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, iarg->cachefile) != 0) { (void) zutil_no_memory(hdl); nvlist_free(raw); nvlist_free(pools); return (NULL); } + update_vdevs_config_dev_sysfs_path(src); + if ((dst = zutil_refresh_config(hdl, src)) == NULL) { nvlist_free(raw); nvlist_free(pools); return (NULL); } if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { (void) zutil_no_memory(hdl); nvlist_free(dst); nvlist_free(raw); nvlist_free(pools); return (NULL); } nvlist_free(dst); } nvlist_free(raw); return (pools); } static nvlist_t * zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg) { pthread_mutex_t lock; avl_tree_t *cache; nvlist_t *pools = NULL; verify(iarg->poolname == NULL || iarg->guid == 0); pthread_mutex_init(&lock, NULL); /* * Locate pool member vdevs by blkid or by directory scanning. * On success a newly allocated AVL tree which is populated with an * entry for each discovered vdev will be returned in the cache. * It's the caller's responsibility to consume and destroy this tree. */ if (iarg->scan || iarg->paths != 0) { size_t dirs = iarg->paths; const char * const *dir = (const char * const *)iarg->path; if (dirs == 0) dir = zpool_default_search_paths(&dirs); if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0) { pthread_mutex_destroy(&lock); return (NULL); } } else { if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) { pthread_mutex_destroy(&lock); return (NULL); } } pools = zpool_find_import_impl(hdl, iarg, &lock, cache); pthread_mutex_destroy(&lock); return (pools); } nvlist_t * zpool_search_import(void *hdl, importargs_t *import, const pool_config_ops_t *pco) { libpc_handle_t handle = { 0 }; nvlist_t *pools = NULL; handle.lpc_lib_handle = hdl; handle.lpc_ops = pco; handle.lpc_printerr = B_TRUE; verify(import->poolname == NULL || import->guid == 0); if (import->cachefile != NULL) pools = zpool_find_import_cached(&handle, import); else pools = zpool_find_import(&handle, import); if ((pools == NULL || nvlist_empty(pools)) && handle.lpc_open_access_error && geteuid() != 0) { (void) zutil_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN, "no pools found")); } return (pools); } static boolean_t pool_match(nvlist_t *cfg, char *tgt) { uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; if (guid != 0) { if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) return (v == guid); } else { if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) return (strcmp(s, tgt) == 0); } return (B_FALSE); } int zpool_find_config(void *hdl, const char *target, nvlist_t **configp, importargs_t *args, const pool_config_ops_t *pco) { nvlist_t *pools; nvlist_t *match = NULL; nvlist_t *config = NULL; char *sepp = NULL; int count = 0; char *targetdup = strdup(target); *configp = NULL; if ((sepp = strpbrk(targetdup, "/@")) != NULL) *sepp = '\0'; pools = zpool_search_import(hdl, args, pco); if (pools != NULL) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { VERIFY0(nvpair_value_nvlist(elem, &config)); if (pool_match(config, targetdup)) { count++; if (match != NULL) { /* multiple matches found */ continue; } else { match = fnvlist_dup(config); } } } fnvlist_free(pools); } if (count == 0) { free(targetdup); return (ENOENT); } if (count > 1) { free(targetdup); fnvlist_free(match); return (EINVAL); } *configp = match; free(targetdup); return (0); } + +/* + * Internal function for iterating over the vdevs. + * + * For each vdev, func() will be called and will be passed 'zhp' (which is + * typically the zpool_handle_t cast as a void pointer), the vdev's nvlist, and + * a user-defined data pointer). + * + * The return values from all the func() calls will be OR'd together and + * returned. + */ +int +for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data) +{ + nvlist_t **child; + uint_t c, children; + int ret = 0; + int i; + char *type; + + const char *list[] = { + ZPOOL_CONFIG_SPARES, + ZPOOL_CONFIG_L2CACHE, + ZPOOL_CONFIG_CHILDREN + }; + + for (i = 0; i < ARRAY_SIZE(list); i++) { + if (nvlist_lookup_nvlist_array(nv, list[i], &child, + &children) == 0) { + for (c = 0; c < children; c++) { + uint64_t ishole = 0; + + (void) nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole); + + if (ishole) + continue; + + ret |= for_each_vdev_cb(zhp, child[c], + func, data); + } + } + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (ret); + + /* Don't run our function on root vdevs */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0) { + ret |= func(zhp, nv, data); + } + + return (ret); +} + +/* + * Given an ZPOOL_CONFIG_VDEV_TREE nvpair, iterate over all the vdevs, calling + * func() for each one. func() is passed the vdev's nvlist and an optional + * user-defined 'data' pointer. + */ +int +for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data) +{ + return (for_each_vdev_cb(NULL, nvroot, func, data)); +} diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index 688bd033979a..e83a92e4b341 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -1,652 +1,655 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" .Dd April 15, 2021 .Dt ZFS-SEND 8 .Os . .Sh NAME .Nm zfs-send .Nd generate backup stream of ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm send .Op Fl DLPRbcehnpsvw .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Nm zfs .Cm send .Op Fl DLPcensvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPcenpv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Nm zfs .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token .Nm zfs .Cm send .Op Fl Pnv .Fl S Ar filesystem .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns … . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm send .Op Fl DLPRbcehnpvw .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Xc Creates a stream representation of the second .Ar snapshot , which is written to standard output. The output can be redirected to a file or to a different system .Po for example, using .Xr ssh 1 .Pc . By default, a full stream is generated. .Bl -tag -width "-D" .It Fl D , -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl I Ar snapshot Generate a stream package that sends all intermediary snapshots from the first snapshot to the second snapshot. For example, .Fl I Em @a Em fs@d is similar to .Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . The incremental source may be specified as with the .Fl i option. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P , -parsable Print machine-parsable verbose information about the stream package generated. .It Fl R , -replicate Generate a replication stream package, which will replicate the specified file system, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved. .Pp If the .Fl i or .Fl I flags are used in conjunction with the .Fl R flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. If the .Fl R flag is used to send encrypted datasets, then .Fl w must also be specified. .It Fl e , -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl b , -backup Sends only received property values whether or not they are overridden by local settings, but only if the dataset has ever been received. Use this option when you want .Nm zfs Cm receive to restore received properties backed up on the sent dataset and to avoid sending local settings that may have nothing to do with the source dataset, but only with how the data is backed up. .It Fl c , -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. Streams sent with .Fl c will not have their data recompressed on the receiver side using .Fl o Sy compress Ns = Ar value . The data will stay compressed as it was from the sender. The new compression property will be set for future data. +Note that uncompressed data from the sender will still attempt to +compress on the receiver, unless you specify +.Fl o Sy compress Ns = Em off . .It Fl w , -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl h , -holds Generate a stream package that includes any snapshot holds (created with the .Nm zfs Cm hold command), and indicating to .Nm zfs Cm receive that the holds be applied to the dataset on the receiving system. .It Fl i Ar snapshot Generate an incremental stream from the first .Ar snapshot .Pq the incremental source to the second .Ar snapshot .Pq the incremental target . The incremental source can be specified as the last component of the snapshot name .Po the .Sy @ character and following .Pc and it is assumed to be from the same file system as the incremental target. .Pp If the destination is a clone, the source may be the origin snapshot, which must be fully specified .Po for example, .Em pool/fs@origin , not just .Em @origin .Pc . .It Fl n , -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl p , -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. Sends of encrypted datasets must use .Fl w when using this flag. .It Fl s , -skip-missing Allows sending a replication stream even when there are snapshots missing in the hierarchy. When a snapshot is missing, instead of throwing an error and aborting the send, a warning is printed to the standard error stream and the dataset to which it belongs and its descendents are skipped. This flag can only be used in conjunction with .Fl R . .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. .El .It Xo .Nm zfs .Cm send .Op Fl DLPcenvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. If the destination is a filesystem or volume, the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Qq --head-- . .Bl -tag -width "-D" .It Fl D , -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P , -parsable Print machine-parsable verbose information about the stream package generated. .It Fl c , -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl w , -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl e , -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl i Ar snapshot Ns | Ns Ar bookmark Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's file system, in which case it can be specified as the last component of the name .Po the .Sy # or .Sy @ character and following .Pc . .Pp If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. .It Fl n , -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .El .It Xo .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPcenpv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Xc Generate a redacted send stream. This send stream contains all blocks from the snapshot being sent that aren't included in the redaction list contained in the bookmark specified by the .Fl -redact (or .Fl d ) flag. The resulting send stream is said to be redacted with respect to the snapshots the bookmark specified by the .Fl -redact No flag was created with. The bookmark must have been created by running .Nm zfs Cm redact on the snapshot being sent. .Pp This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. For example, if a filesystem contains sensitive data, and it has clones where that sensitive data has been secured or replaced with dummy data, redacted sends can be used to replicate the secured data without replicating the original sensitive data, while still sharing all possible blocks. A snapshot that has been redacted with respect to a set of snapshots will contain all blocks referenced by at least one snapshot in the set, but will contain none of the blocks referenced by none of the snapshots in the set. In other words, if all snapshots in the set have modified a given block in the parent, that block will not be sent; but if one or more snapshots have not modified a block in the parent, they will still reference the parent's block, so that block will be sent. Note that only user data will be redacted. .Pp When the redacted send stream is received, we will generate a redacted snapshot. Due to the nature of redaction, a redacted dataset can only be used in the following ways: .Bl -enum -width "a." .It To receive, as a clone, an incremental send from the original snapshot to one of the snapshots it was redacted with respect to. In this case, the stream will produce a valid dataset when received because all blocks that were redacted in the parent are guaranteed to be present in the child's send stream. This use case will produce a normal snapshot, which can be used just like other snapshots. . .It To receive an incremental send from the original snapshot to something redacted with respect to a subset of the set of snapshots the initial snapshot was redacted with respect to. In this case, each block that was redacted in the original is still redacted (redacting with respect to additional snapshots causes less data to be redacted (because the snapshots define what is permitted, and everything else is redacted)). This use case will produce a new redacted snapshot. .It To receive an incremental send from a redaction bookmark of the original snapshot that was created when redacting with respect to a subset of the set of snapshots the initial snapshot was created with respect to anything else. A send stream from such a redaction bookmark will contain all of the blocks necessary to fill in any redacted data, should it be needed, because the sending system is aware of what blocks were originally redacted. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .It To receive an incremental send from a redacted version of the initial snapshot that is redacted with respect to a subject of the set of snapshots the initial snapshot was created with respect to. A send stream from a compatible redacted dataset will contain all of the blocks necessary to fill in any redacted data. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .It To receive a full send as a clone of the redacted snapshot. Since the stream is a full send, it definitionally contains all the data needed to create a new dataset. This use case will either produce a normal snapshot or a redacted one, depending on whether the full send stream was redacted. .El .Pp These restrictions are detected and enforced by .Nm zfs Cm receive ; a redacted send stream will contain the list of snapshots that the stream is redacted with respect to. These are stored with the redacted snapshot, and are used to detect and correctly handle the cases above. Note that for technical reasons, raw sends and redacted sends cannot be combined at this time. .It Xo .Nm zfs .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token .Xc Creates a send stream which resumes an interrupted receive. The .Ar receive_resume_token is the value of this property on the filesystem or volume that was being received into. See the documentation for .Nm zfs Cm receive Fl s for more details. .It Xo .Nm zfs .Cm send .Op Fl Pnv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Fl S .Ar filesystem .Xc Generate a send stream from a dataset that has been partially received. .Bl -tag -width "-L" .It Fl S , -saved This flag requires that the specified filesystem previously received a resumable send that did not finish and was interrupted. In such scenarios this flag enables the user to send this partially received state. Using this flag will always use the last fully received snapshot as the incremental source if it exists. .El .It Xo .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns … .Xc Generate a new redaction bookmark. In addition to the typical bookmark information, a redaction bookmark contains the list of redacted blocks and the list of redaction snapshots specified. The redacted blocks are blocks in the snapshot which are not referenced by any of the redaction snapshots. These blocks are found by iterating over the metadata in each redaction snapshot to determine what has been changed since the target snapshot. Redaction is designed to support redacted zfs sends; see the entry for .Nm zfs Cm send for more information on the purpose of this operation. If a redact operation fails partway through (due to an error or a system failure), the redaction can be resumed by rerunning the same command. .El .Ss Redaction ZFS has support for a limited version of data subsetting, in the form of redaction. Using the .Nm zfs Cm redact command, a .Sy redaction bookmark can be created that stores a list of blocks containing sensitive information. When provided to .Nm zfs Cm send , this causes a .Sy redacted send to occur. Redacted sends omit the blocks containing sensitive information, replacing them with REDACT records. When these send streams are received, a .Sy redacted dataset is created. A redacted dataset cannot be mounted by default, since it is incomplete. It can be used to receive other send streams. In this way datasets can be used for data backup and replication, with all the benefits that zfs send and receive have to offer, while protecting sensitive information from being stored on less-trusted machines or services. .Pp For the purposes of redaction, there are two steps to the process. A redact step, and a send/receive step. First, a redaction bookmark is created. This is done by providing the .Nm zfs Cm redact command with a parent snapshot, a bookmark to be created, and a number of redaction snapshots. These redaction snapshots must be descendants of the parent snapshot, and they should modify data that is considered sensitive in some way. Any blocks of data modified by all of the redaction snapshots will be listed in the redaction bookmark, because it represents the truly sensitive information. When it comes to the send step, the send process will not send the blocks listed in the redaction bookmark, instead replacing them with REDACT records. When received on the target system, this will create a redacted dataset, missing the data that corresponds to the blocks in the redaction bookmark on the sending system. The incremental send streams from the original parent to the redaction snapshots can then also be received on the target system, and this will produce a complete snapshot that can be used normally. Incrementals from one snapshot on the parent filesystem and another can also be done by sending from the redaction bookmark, rather than the snapshots themselves. .Pp In order to make the purpose of the feature more clear, an example is provided. Consider a zfs filesystem containing four files. These files represent information for an online shopping service. One file contains a list of usernames and passwords, another contains purchase histories, a third contains click tracking data, and a fourth contains user preferences. The owner of this data wants to make it available for their development teams to test against, and their market research teams to do analysis on. The development teams need information about user preferences and the click tracking data, while the market research teams need information about purchase histories and user preferences. Neither needs access to the usernames and passwords. However, because all of this data is stored in one ZFS filesystem, it must all be sent and received together. In addition, the owner of the data wants to take advantage of features like compression, checksumming, and snapshots, so they do want to continue to use ZFS to store and transmit their data. Redaction can help them do so. First, they would make two clones of a snapshot of the data on the source. In one clone, they create the setup they want their market research team to see; they delete the usernames and passwords file, and overwrite the click tracking data with dummy information. In another, they create the setup they want the development teams to see, by replacing the passwords with fake information and replacing the purchase histories with randomly generated ones. They would then create a redaction bookmark on the parent snapshot, using snapshots on the two clones as redaction snapshots. The parent can then be sent, redacted, to the target server where the research and development teams have access. Finally, incremental sends from the parent snapshot to each of the clones can be sent to and received on the target server; these snapshots are identical to the ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . .Sh SEE ALSO .Xr zfs-bookmark 8 , .Xr zfs-receive 8 , .Xr zfs-redact 8 , .Xr zfs-snapshot 8 diff --git a/sys/contrib/openzfs/man/man8/zfs.8 b/sys/contrib/openzfs/man/man8/zfs.8 index fca1ba00da7d..48453ef46c0d 100644 --- a/sys/contrib/openzfs/man/man8/zfs.8 +++ b/sys/contrib/openzfs/man/man8/zfs.8 @@ -1,773 +1,773 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" .Dd June 30, 2019 .Dt ZFS 8 .Os . .Sh NAME .Nm zfs .Nd configure ZFS datasets .Sh SYNOPSIS .Nm .Fl ?V .Nm .Cm version .Nm .Cm subcommand .Op Ar arguments . .Sh DESCRIPTION The .Nm command configures ZFS datasets within a ZFS storage pool, as described in .Xr zpool 8 . A dataset is identified by a unique path within the ZFS namespace. For example: .Dl pool/{filesystem,volume,snapshot} .Pp where the maximum length of a dataset name is .Sy MAXNAMELEN Pq 256B and the maximum amount of nesting allowed in a path is 50 levels deep. .Pp A dataset can be one of the following: .Bl -tag -offset Ds -width "file system" .It Sy file system Can be mounted within the standard system namespace and behaves like other file systems. While ZFS file systems are designed to be POSIX-compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to non-standard behavior when checking file system free space. .It Sy volume A logical volume exported as a raw or block device. This type of dataset should only be used when a block device is required. File systems are typically used in most environments. .It Sy snapshot A read-only version of a file system or volume at a given point in time. It is specified as .Ar filesystem Ns @ Ns Ar name or .Ar volume Ns @ Ns Ar name . .It Sy bookmark Much like a .Sy snapshot , but without the hold on on-disk data. It can be used as the source of a send (but not for a receive). It is specified as .Ar filesystem Ns # Ns Ar name or .Ar volume Ns # Ns Ar name . .El .Pp See .Xr zfsconcepts 7 for details. . .Ss Properties Properties are divided into two types: native properties and user-defined .Pq or Qq user properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about properties, see .Xr zfsprops 7 . . .Ss Encryption Enabling the .Sy encryption feature allows for the creation of encrypted filesystems and volumes. ZFS will encrypt file and zvol data, file attributes, ACLs, permission bits, directory listings, FUID mappings, and .Sy userused Ns / Ns Sy groupused Ns / Ns Sy projectused data. For an overview of encryption, see .Xr zfs-load-key 8 . . .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Bl -tag -width "" .It Nm Fl ? Displays a help message. .It Xo .Nm .Fl V , -version .Xc .It Xo .Nm .Cm version .Xc Displays the software version of the .Nm userland utility and the zfs kernel module. .El . .Ss Dataset Management .Bl -tag -width "" .It Xr zfs-list 8 Lists the property information for the given datasets in tabular form. .It Xr zfs-create 8 Creates a new ZFS file system or volume. .It Xr zfs-destroy 8 Destroys the given dataset(s), snapshot(s), or bookmark. .It Xr zfs-rename 8 Renames the given dataset (filesystem or snapshot). .It Xr zfs-upgrade 8 Manage upgrading the on-disk version of filesystems. .El . .Ss Snapshots .Bl -tag -width "" .It Xr zfs-snapshot 8 Creates snapshots with the given names. .It Xr zfs-rollback 8 Roll back the given dataset to a previous snapshot. .It Xr zfs-hold 8 Ns / Ns Xr zfs-release 8 Add or remove a hold reference to the specified snapshot or snapshots. If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Nm zfs Cm destroy command return .Sy EBUSY . .It Xr zfs-diff 8 Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem. .El . .Ss Clones .Bl -tag -width "" .It Xr zfs-clone 8 Creates a clone of the given snapshot. .It Xr zfs-promote 8 Promotes a clone file system to no longer be dependent on its .Qq origin snapshot. .El . .Ss Send & Receive .Bl -tag -width "" .It Xr zfs-send 8 Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. .It Xr zfs-receive 8 Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the .Xr zfs-send 8 subcommand, which by default creates a full stream. .It Xr zfs-bookmark 8 Creates a new bookmark of the given snapshot or bookmark. Bookmarks mark the point in time when the snapshot was created, and can be used as the incremental source for a .Nm zfs Cm send command. .It Xr zfs-redact 8 Generate a new redaction bookmark. This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. .El . .Ss Properties .Bl -tag -width "" .It Xr zfs-get 8 Displays properties for the given datasets. .It Xr zfs-set 8 Sets the property or list of properties to the given value(s) for each dataset. .It Xr zfs-inherit 8 Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the .Fl S option reverted to the received value if one exists. .El . .Ss Quotas .Bl -tag -width "" .It Xr zfs-userspace 8 Ns / Ns Xr zfs-groupspace 8 Ns / Ns Xr zfs-projectspace 8 Displays space consumed by, and quotas on, each user, group, or project in the specified filesystem or snapshot. .It Xr zfs-project 8 List, set, or clear project ID and/or inherit flag on the file(s) or directories. .El . .Ss Mountpoints .Bl -tag -width "" .It Xr zfs-mount 8 Displays all ZFS file systems currently mounted, or mount ZFS filesystem on a path described by its .Sy mountpoint property. .It Xr zfs-unmount 8 Unmounts currently mounted ZFS file systems. .El . .Ss Shares .Bl -tag -width "" .It Xr zfs-share 8 Shares available ZFS file systems. .It Xr zfs-unshare 8 Unshares currently shared ZFS file systems. .El . .Ss Delegated Administration .Bl -tag -width "" .It Xr zfs-allow 8 Delegate permissions on the specified filesystem or volume. .It Xr zfs-unallow 8 Remove delegated permissions on the specified filesystem or volume. .El . .Ss Encryption .Bl -tag -width "" .It Xr zfs-change-key 8 Add or change an encryption key on the specified dataset. .It Xr zfs-load-key 8 Load the key for the specified encrypted dataset, enabling access. .It Xr zfs-unload-key 8 Unload a key for the specified dataset, removing the ability to access the dataset. .El . .Ss Channel Programs .Bl -tag -width "" .It Xr zfs-program 8 Execute ZFS administrative operations programmatically via a Lua script-language channel program. .El . .Ss Jails .Bl -tag -width "" .It Xr zfs-jail 8 Attaches a filesystem to a jail. .It Xr zfs-unjail 8 Detaches a filesystem from a jail. .El . .Ss Waiting .Bl -tag -width "" .It Xr zfs-wait 8 Wait for background activity in a filesystem to complete. .El . .Sh EXIT STATUS The .Nm utility exits .Sy 0 on success, .Sy 1 if an error occurs, and .Sy 2 if invalid command line options were specified. . .Sh EXAMPLES .Bl -tag -width "" . .It Sy Example 1 : No Creating a ZFS File System Hierarchy The following commands create a file system named .Ar pool/home and a file system named .Ar pool/home/bob . The mount point .Pa /export/home is set for the parent file system, and is automatically inherited by the child file system. .Dl # Nm zfs Cm create Ar pool/home .Dl # Nm zfs Cm set Sy mountpoint Ns = Ns Ar /export/home pool/home .Dl # Nm zfs Cm create Ar pool/home/bob . .It Sy Example 2 : No Creating a ZFS Snapshot The following command creates a snapshot named .Ar yesterday . This snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of the .Ar pool/home/bob file system. .Dl # Nm zfs Cm snapshot Ar pool/home/bob Ns @ Ns Ar yesterday . .It Sy Example 3 : No Creating and Destroying Multiple Snapshots The following command creates snapshots named .Ar yesterday No of Ar pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. .Dl # Nm zfs Cm snapshot Fl r Ar pool/home Ns @ Ns Ar yesterday .Dl # Nm zfs Cm destroy Fl r Ar pool/home Ns @ Ns Ar yesterday . .It Sy Example 4 : No Disabling and Enabling File System Compression The following command disables the .Sy compression property for all file systems under .Ar pool/home . The next command explicitly enables .Sy compression for .Ar pool/home/anne . .Dl # Nm zfs Cm set Sy compression Ns = Ns Sy off Ar pool/home .Dl # Nm zfs Cm set Sy compression Ns = Ns Sy on Ar pool/home/anne . .It Sy Example 5 : No Listing ZFS Datasets The following command lists all active file systems and volumes in the system. Snapshots are displayed if .Sy listsnaps Ns = Ns Sy on . The default is .Sy off . See .Xr zpoolprops 7 for more information on pool properties. .Bd -literal -compact -offset Ds .No # Nm zfs Cm list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /export/home pool/home/anne 18K 457G 18K /export/home/anne pool/home/bob 276K 457G 276K /export/home/bob .Ed . .It Sy Example 6 : No Setting a Quota on a ZFS File System The following command sets a quota of 50 Gbytes for .Ar pool/home/bob : .Dl # Nm zfs Cm set Sy quota Ns = Ns Ar 50G pool/home/bob . .It Sy Example 7 : No Listing ZFS Properties The following command lists all properties for .Ar pool/home/bob : .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Sy all Ar pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /pool/home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default pool/home/bob acltype off default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 4 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - .Ed .Pp The following command gets a single property value: .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl H o Sy value compression Ar pool/home/bob on .Ed .Pp The following command lists all properties with local settings for .Ar pool/home/bob : .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl r s Sy local Fl o Sy name , Ns Sy property , Ns Sy value all Ar pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed . .It Sy Example 8 : No Rolling Back a ZFS File System The following command reverts the contents of .Ar pool/home/anne to the snapshot named .Ar yesterday , deleting all intermediate snapshots: .Dl # Nm zfs Cm rollback Fl r Ar pool/home/anne Ns @ Ns Ar yesterday . .It Sy Example 9 : No Creating a ZFS Clone The following command creates a writable file system whose initial contents are the same as .Ar pool/home/bob@yesterday . .Dl # Nm zfs Cm clone Ar pool/home/bob@yesterday pool/clone . .It Sy Example 10 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .It Sy Example 11 : No Inheriting ZFS Properties The following command causes .Ar pool/home/bob No and Ar pool/home/anne to inherit the .Sy checksum property from their parent. .Dl # Nm zfs Cm inherit Sy checksum Ar pool/home/bob pool/home/anne . .It Sy Example 12 : No Remotely Replicating ZFS Data The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Em poolB/received/fs@a and .Em poolB/received/fs@b , respectively. .Em poolB must contain the file system .Em poolB/received , and must not initially contain .Em poolB/received/fs . .Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar pool/fs@a | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs Ns @ Ns Ar a .No # Nm zfs Cm send Fl i Ar a pool/fs@b | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs .Ed . .It Sy Example 13 : No Using the Nm zfs Cm receive Fl d No Option The following command sends a full stream of .Ar poolA/fsA/fsB@snap to a remote machine, receiving it into .Ar poolB/received/fsA/fsB@snap . The .Ar fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Ar poolB must contain the file system .Ar poolB/received . If .Ar poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar poolA/fsA/fsB@snap | .No " " Nm ssh Ar host Nm zfs Cm receive Fl d Ar poolB/received .Ed . .It Sy Example 14 : No Setting User Properties The following example sets the user-defined .Ar com.example : Ns Ar department property for a dataset: .Dl # Nm zfs Cm set Ar com.example : Ns Ar department Ns = Ns Ar 12345 tank/accounting . .It Sy Example 15 : No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -compact -offset Ds .No # Nm zfs Cm destroy Fl r Ar pool/users@7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@6daysago No @ Ns Ar 7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@5daysago No @ Ns Ar 6daysago .No # Nm zfs Cm rename Fl r Ar pool/users@4daysago No @ Ns Ar 5daysago .No # Nm zfs Cm rename Fl r Ar pool/users@3daysago No @ Ns Ar 4daysago .No # Nm zfs Cm rename Fl r Ar pool/users@2daysago No @ Ns Ar 3daysago .No # Nm zfs Cm rename Fl r Ar pool/users@yesterday No @ Ns Ar 2daysago .No # Nm zfs Cm rename Fl r Ar pool/users@today No @ Ns Ar yesterday .No # Nm zfs Cm snapshot Fl r Ar pool/users Ns @ Ns Ar today .Ed . .It Sy Example 16 : No Setting sharenfs Property Options on a ZFS File System The following commands show how to set .Sy sharenfs property options to enable read-write access for a set of IP addresses and to enable root access for system .Qq neo on the .Ar tank/home file system: -.Dl # Nm zfs Cm set Sy sharenfs Ns = Ns ' Ns Ar rw Ns =@123.123.0.0/16,root= Ns Ar neo Ns ' tank/home +.Dl # Nm zfs Cm set Sy sharenfs Ns = Ns ' Ns Ar rw Ns =@123.123.0.0/16:[::1],root= Ns Ar neo Ns ' tank/home .Pp If you are using DNS for host name resolution, specify the fully-qualified hostname. . .It Sy Example 17 : No Delegating ZFS Administration Permissions on a ZFS Dataset The following example shows how to set permissions so that user .Ar cindys can create, destroy, mount, and take snapshots on .Ar tank/cindys . The permissions on .Ar tank/cindys are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Sy cindys create , Ns Sy destroy , Ns Sy mount , Ns Sy snapshot Ar tank/cindys .No # Nm zfs Cm allow Ar tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .Pp Because the .Ar tank/cindys mount point permission is set to 755 by default, user .Ar cindys will be unable to mount file systems under .Ar tank/cindys . Add an ACE similar to the following syntax to provide mount point access: .Dl # Cm chmod No A+user: Ns Ar cindys Ns :add_subdirectory:allow Ar /tank/cindys . .It Sy Example 18 : No Delegating Create Time Permissions on a ZFS Dataset The following example shows how to grant anyone in the group .Ar staff to create file systems in .Ar tank/users . This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on .Ar tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Ar staff Sy create , Ns Sy mount Ar tank/users .No # Nm zfs Cm allow Fl c Sy destroy Ar tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed . .It Sy Example 19 : No Defining and Granting a Permission Set on a ZFS Dataset The following example shows how to define and grant a permission set on the .Ar tank/users file system. The permissions on .Ar tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Fl s No @ Ns Ar pset Sy create , Ns Sy destroy , Ns Sy snapshot , Ns Sy mount Ar tank/users .No # Nm zfs Cm allow staff No @ Ns Ar pset tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed . .It Sy Example 20 : No Delegating Property Permissions on a ZFS Dataset The following example shows to grant the ability to set quotas and reservations on the .Ar users/home file system. The permissions on .Ar users/home are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Ar cindys Sy quota , Ns Sy reservation Ar users/home .No # Nm zfs Cm allow Ar users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation cindys% zfs set quota=10G users/home/marks cindys% zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed . .It Sy Example 21 : No Removing ZFS Delegated Permissions on a ZFS Dataset The following example shows how to remove the snapshot permission from the .Ar staff group on the .Sy tank/users file system. The permissions on .Sy tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm unallow Ar staff Sy snapshot Ar tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed . .It Sy Example 22 : No Showing the differences between a snapshot and a ZFS Dataset The following example shows how to see what has changed between a prior snapshot of a ZFS dataset and its current state. The .Fl F option is used to indicate type information for the files affected. .Bd -literal -compact -offset Ds .No # Nm zfs Cm diff Fl F Ar tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .Ed . .It Sy Example 23 : No Creating a bookmark The following example create a bookmark to a snapshot. This bookmark can then be used instead of snapshot in send streams. .Dl # Nm zfs Cm bookmark Ar rpool Ns @ Ns Ar snapshot rpool Ns # Ns Ar bookmark . .It Sy Example 24 : No Setting Sy sharesmb No Property Options on a ZFS File System The following example show how to share SMB filesystem through ZFS. Note that a user and their password must be given. .Dl # Nm smbmount Ar //127.0.0.1/share_tmp /mnt/tmp Fl o No user=workgroup/turbo,password=obrut,uid=1000 .Pp Minimal .Pa /etc/samba/smb.conf configuration is required, as follows. .Pp Samba will need to bind to the loopback interface for the ZFS utilities to communicate with Samba. This is the default behavior for most Linux distributions. .Pp Samba must be able to authenticate a user. This can be done in a number of ways .Pq Xr passwd 5 , LDAP , Xr smbpasswd 5 , &c.\& . How to do this is outside the scope of this document – refer to .Xr smb.conf 5 for more information. .Pp See the .Sx USERSHARES section for all configuration options, in case you need to modify any options of the share afterwards. Do note that any changes done with the .Xr net 8 command will be undone if the share is ever unshared (like via a reboot). .El . .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZFS_MOUNT_HELPER" .It Sy ZFS_MOUNT_HELPER Cause .Nm zfs Cm mount to use .Xr mount 8 to mount ZFS datasets. This option is provided for backwards compatibility with older ZFS versions. .El . .Sh INTERFACE STABILITY .Sy Committed . . .Sh SEE ALSO .Xr attr 1 , .Xr gzip 1 , .Xr ssh 1 , .Xr chmod 2 , .Xr fsync 2 , .Xr stat 2 , .Xr write 2 , .Xr acl 5 , .Xr attributes 5 , .Xr exports 5 , .Xr zfsconcepts 7 , .Xr zfsprops 7 , .Xr exportfs 8 , .Xr mount 8 , .Xr net 8 , .Xr selinux 8 , .Xr zfs-allow 8 , .Xr zfs-bookmark 8 , .Xr zfs-change-key 8 , .Xr zfs-clone 8 , .Xr zfs-create 8 , .Xr zfs-destroy 8 , .Xr zfs-diff 8 , .Xr zfs-get 8 , .Xr zfs-groupspace 8 , .Xr zfs-hold 8 , .Xr zfs-inherit 8 , .Xr zfs-jail 8 , .Xr zfs-list 8 , .Xr zfs-load-key 8 , .Xr zfs-mount 8 , .Xr zfs-program 8 , .Xr zfs-project 8 , .Xr zfs-projectspace 8 , .Xr zfs-promote 8 , .Xr zfs-receive 8 , .Xr zfs-redact 8 , .Xr zfs-release 8 , .Xr zfs-rename 8 , .Xr zfs-rollback 8 , .Xr zfs-send 8 , .Xr zfs-set 8 , .Xr zfs-share 8 , .Xr zfs-snapshot 8 , .Xr zfs-unallow 8 , .Xr zfs-unjail 8 , .Xr zfs-unload-key 8 , .Xr zfs-unmount 8 , .Xr zfs-unshare 8 , .Xr zfs-upgrade 8 , .Xr zfs-userspace 8 , .Xr zfs-wait 8 , .Xr zpool 8 diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in index 089b3ff88490..8538dd9d6997 100644 --- a/sys/contrib/openzfs/module/Makefile.in +++ b/sys/contrib/openzfs/module/Makefile.in @@ -1,140 +1,141 @@ include Kbuild INSTALL_MOD_DIR ?= extra +INSTALL_MOD_PATH ?= $(DESTDIR) SUBDIR_TARGETS = icp lua zstd all: modules distclean maintainer-clean: clean install: modules_install uninstall: modules_uninstall check: .PHONY: all distclean maintainer-clean install uninstall check distdir \ modules modules-Linux modules-FreeBSD modules-unknown \ clean clean-Linux clean-FreeBSD \ modules_install modules_install-Linux modules_install-FreeBSD \ modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \ cppcheck cppcheck-Linux cppcheck-FreeBSD # For FreeBSD, use debug options from ./configure if not overridden. export WITH_DEBUG ?= @WITH_DEBUG@ export WITH_INVARIANTS ?= @WITH_INVARIANTS@ # Filter out options that FreeBSD make doesn't understand getflags = ( \ set -- \ $(filter-out --%,$(firstword $(MFLAGS))) \ $(filter -I%,$(MFLAGS)) \ $(filter -j%,$(MFLAGS)); \ fmakeflags=""; \ while getopts :deiI:j:knqrstw flag; do \ case $$flag in \ \?) :;; \ :) if [ $$OPTARG = "j" ]; then \ ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \ if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \ fi;; \ d) fmakeflags="$$fmakeflags -dA";; \ *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \ esac; \ done; \ echo $$fmakeflags \ ) FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags)) ifneq (@abs_srcdir@,@abs_builddir@) FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@ endif FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) modules-Linux: list='$(SUBDIR_TARGETS)'; for targetdir in $$list; do \ $(MAKE) -C $$targetdir; \ done $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: +$(FMAKE) modules-unknown: @true modules: modules-@ac_system@ clean-Linux: @# Only cleanup the kernel build directories when CONFIG_KERNEL @# is defined. This indicates that kernel modules should be built. @CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ clean if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi if [ -f Module.markers ]; then $(RM) Module.markers; fi find . -name '*.ur-safe' -type f -print | xargs $(RM) clean-FreeBSD: +$(FMAKE) clean clean: clean-@ac_system@ modules_install-Linux: @# Install the kernel modules $(MAKE) -C @LINUX_OBJ@ M=`pwd` modules_install \ - INSTALL_MOD_PATH=$(DESTDIR)$(INSTALL_MOD_PATH) \ + INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) \ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging - kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ + kmoddir=$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ if [ -n "$(DESTDIR)" ]; then \ find $$kmoddir -name 'modules.*' | xargs $(RM); \ fi - sysmap=$(DESTDIR)$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ + sysmap=$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ if [ -f $$sysmap ]; then \ depmod -ae -F $$sysmap @LINUX_VERSION@; \ fi modules_install-FreeBSD: @# Install the kernel modules +$(FMAKE) install modules_install: modules_install-@ac_system@ modules_uninstall-Linux: @# Uninstall the kernel modules - kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ + kmoddir=$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ for objdir in $(ZFS_MODULES); do \ $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ done modules_uninstall-FreeBSD: @false modules_uninstall: modules_uninstall-@ac_system@ cppcheck-Linux: @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \ --inline-suppr --suppress=noValidConfiguration \ --enable=warning,information -D_KERNEL \ --include=@LINUX_OBJ@/include/generated/autoconf.h \ --include=@top_srcdir@/zfs_config.h \ --config-exclude=@LINUX_OBJ@/include \ -I @LINUX_OBJ@/include \ -I @top_srcdir@/include/os/linux/kernel \ -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ avl icp lua nvpair spl unicode zcommon zfs zstd os/linux cppcheck-FreeBSD: @true cppcheck: cppcheck-@ac_system@ distdir: (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \ while read path; do \ mkdir -p $$distdir/$${path%/*}; \ cp @srcdir@/$$path $$distdir/$$path; \ done; \ cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c index 8bc1ef1325e9..fa1034ff88bc 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c @@ -1,508 +1,509 @@ /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. */ /* * See abd.c for a general overview of the arc buffered data (ABD). * * Using a large proportion of scattered ABDs decreases ARC fragmentation since * when we are at the limit of allocatable space, using equal-size chunks will * allow us to quickly reclaim enough space for a new large allocation (assuming * it is also scattered). * * ABDs are allocated scattered by default unless the caller uses * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled. */ #include #include #include #include #include #include typedef struct abd_stats { kstat_named_t abdstat_struct_size; kstat_named_t abdstat_scatter_cnt; kstat_named_t abdstat_scatter_data_size; kstat_named_t abdstat_scatter_chunk_waste; kstat_named_t abdstat_linear_cnt; kstat_named_t abdstat_linear_data_size; } abd_stats_t; static abd_stats_t abd_stats = { /* Amount of memory occupied by all of the abd_t struct allocations */ { "struct_size", KSTAT_DATA_UINT64 }, /* * The number of scatter ABDs which are currently allocated, excluding * ABDs which don't own their data (for instance the ones which were * allocated through abd_get_offset()). */ { "scatter_cnt", KSTAT_DATA_UINT64 }, /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ { "scatter_data_size", KSTAT_DATA_UINT64 }, /* * The amount of space wasted at the end of the last chunk across all * scatter ABDs tracked by scatter_cnt. */ { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, /* * The number of linear ABDs which are currently allocated, excluding * ABDs which don't own their data (for instance the ones which were * allocated through abd_get_offset() and abd_get_from_buf()). If an * ABD takes ownership of its buf then it will become tracked. */ { "linear_cnt", KSTAT_DATA_UINT64 }, /* Amount of data stored in all linear ABDs tracked by linear_cnt */ { "linear_data_size", KSTAT_DATA_UINT64 }, }; struct { wmsum_t abdstat_struct_size; wmsum_t abdstat_scatter_cnt; wmsum_t abdstat_scatter_data_size; wmsum_t abdstat_scatter_chunk_waste; wmsum_t abdstat_linear_cnt; wmsum_t abdstat_linear_data_size; } abd_sums; /* * zfs_abd_scatter_min_size is the minimum allocation size to use scatter * ABD's for. Smaller allocations will use linear ABD's which use * zio_[data_]buf_alloc(). * * Scatter ABD's use at least one page each, so sub-page allocations waste * some space when allocated as scatter (e.g. 2KB scatter allocation wastes * half of each page). Using linear ABD's for small allocations means that * they will be put on slabs which contain many allocations. * * Linear ABDs for multi-page allocations are easier to use, and in some cases * it allows to avoid buffer copying. But allocation and especially free * of multi-page linear ABDs are expensive operations due to KVA mapping and * unmapping, and with time they cause KVA fragmentations. */ size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1; #if defined(_KERNEL) SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN, &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations."); #endif kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; /* * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are * just a single zero'd page-sized buffer. This allows us to conserve * memory by only using a single zero buffer for the scatter chunks. */ abd_t *abd_zero_scatter = NULL; static char *abd_zero_buf = NULL; static uint_t abd_chunkcnt_for_bytes(size_t size) { return ((size + PAGE_MASK) >> PAGE_SHIFT); } static inline uint_t abd_scatter_chunkcnt(abd_t *abd) { ASSERT(!abd_is_linear(abd)); return (abd_chunkcnt_for_bytes( ABD_SCATTER(abd).abd_offset + abd->abd_size)); } boolean_t abd_size_alloc_linear(size_t size) { return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); } void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { uint_t n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = (n << PAGE_SHIFT) - abd->abd_size; if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); } else { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); } } void abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) { ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); } else { ABDSTAT_BUMPDOWN(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); } } void abd_verify_scatter(abd_t *abd) { uint_t i, n; /* * There is no scatter linear pages in FreeBSD so there is * an error if the ABD has been marked as a linear page. */ ASSERT(!abd_is_linear_page(abd)); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, PAGE_SIZE); n = abd_scatter_chunkcnt(abd); for (i = 0; i < n; i++) { ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL); } } void abd_alloc_chunks(abd_t *abd, size_t size) { uint_t i, n; n = abd_chunkcnt_for_bytes(size); for (i = 0; i < n; i++) { ABD_SCATTER(abd).abd_chunks[i] = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); } } void abd_free_chunks(abd_t *abd) { uint_t i, n; n = abd_scatter_chunkcnt(abd); for (i = 0; i < n; i++) { kmem_cache_free(abd_chunk_cache, ABD_SCATTER(abd).abd_chunks[i]); } } abd_t * abd_alloc_struct_impl(size_t size) { uint_t chunkcnt = abd_chunkcnt_for_bytes(size); /* * In the event we are allocating a gang ABD, the size passed in * will be 0. We must make sure to set abd_size to the size of an * ABD struct as opposed to an ABD scatter with 0 chunks. The gang * ABD struct allocation accounts for an additional 24 bytes over * a scatter ABD with 0 chunks. */ size_t abd_size = MAX(sizeof (abd_t), offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); ASSERT3P(abd, !=, NULL); ABDSTAT_INCR(abdstat_struct_size, abd_size); return (abd); } void abd_free_struct_impl(abd_t *abd) { uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : abd_scatter_chunkcnt(abd); ssize_t size = MAX(sizeof (abd_t), offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); kmem_free(abd, size); ABDSTAT_INCR(abdstat_struct_size, -size); } /* * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where * each chunk in the scatterlist will be set to abd_zero_buf. */ static void abd_alloc_zero_scatter(void) { uint_t i, n; n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); abd_zero_buf = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + bzero(abd_zero_buf, PAGE_SIZE); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; for (i = 0; i < n; i++) { ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = abd_zero_buf; } ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, PAGE_SIZE); } static void abd_free_zero_scatter(void) { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGE_SIZE); abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; kmem_cache_free(abd_chunk_cache, abd_zero_buf); } static int abd_kstats_update(kstat_t *ksp, int rw) { abd_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); as->abdstat_struct_size.value.ui64 = wmsum_value(&abd_sums.abdstat_struct_size); as->abdstat_scatter_cnt.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_cnt); as->abdstat_scatter_data_size.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_data_size); as->abdstat_scatter_chunk_waste.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); as->abdstat_linear_cnt.value.ui64 = wmsum_value(&abd_sums.abdstat_linear_cnt); as->abdstat_linear_data_size.value.ui64 = wmsum_value(&abd_sums.abdstat_linear_data_size); return (0); } void abd_init(void) { abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0, NULL, NULL, NULL, NULL, 0, KMC_NODEBUG); wmsum_init(&abd_sums.abdstat_struct_size, 0); wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); wmsum_init(&abd_sums.abdstat_linear_cnt, 0); wmsum_init(&abd_sums.abdstat_linear_data_size, 0); abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { abd_ksp->ks_data = &abd_stats; abd_ksp->ks_update = abd_kstats_update; kstat_install(abd_ksp); } abd_alloc_zero_scatter(); } void abd_fini(void) { abd_free_zero_scatter(); if (abd_ksp != NULL) { kstat_delete(abd_ksp); abd_ksp = NULL; } wmsum_fini(&abd_sums.abdstat_struct_size); wmsum_fini(&abd_sums.abdstat_scatter_cnt); wmsum_fini(&abd_sums.abdstat_scatter_data_size); wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); wmsum_fini(&abd_sums.abdstat_linear_cnt); wmsum_fini(&abd_sums.abdstat_linear_data_size); kmem_cache_destroy(abd_chunk_cache); abd_chunk_cache = NULL; } void abd_free_linear_page(abd_t *abd) { /* * FreeBSD does not have scatter linear pages * so there is an error. */ VERIFY(0); } /* * If we're going to use this ABD for doing I/O using the block layer, the * consumer of the ABD data doesn't care if it's scattered or not, and we don't * plan to store this ABD in memory for a long period of time, we should * allocate the ABD type that requires the least data copying to do the I/O. * * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os * using a scatter/gather list we should switch to that and replace this call * with vanilla abd_alloc(). */ abd_t * abd_alloc_for_io(size_t size, boolean_t is_metadata) { return (abd_alloc_linear(size, is_metadata)); } abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) { abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; size_t chunkcnt = abd_chunkcnt_for_bytes( (new_offset & PAGE_MASK) + size); ASSERT3U(chunkcnt, <=, abd_scatter_chunkcnt(sabd)); /* * If an abd struct is provided, it is only the minimum size. If we * need additional chunks, we need to allocate a new struct. */ if (abd != NULL && offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) > sizeof (abd_t)) { abd = NULL; } if (abd == NULL) abd = abd_alloc_struct(chunkcnt << PAGE_SHIFT); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK; /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT], chunkcnt * sizeof (void *)); return (abd); } /* * Initialize the abd_iter. */ void abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); aiter->iter_abd = abd; aiter->iter_pos = 0; aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } /* * This is just a helper function to see if we have exhausted the * abd_iter and reached the end. */ boolean_t abd_iter_at_end(struct abd_iter *aiter) { return (aiter->iter_pos == aiter->iter_abd->abd_size); } /* * Advance the iterator by a certain amount. Cannot be called when a chunk is * in use. This can be safely called when the aiter has already exhausted, in * which case this does nothing. */ void abd_iter_advance(struct abd_iter *aiter, size_t amount) { ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) return; aiter->iter_pos += amount; } /* * Map the current chunk into aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. */ void abd_iter_map(struct abd_iter *aiter) { void *paddr; ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ if (abd_iter_at_end(aiter)) return; abd_t *abd = aiter->iter_abd; size_t offset = aiter->iter_pos; if (abd_is_linear(abd)) { aiter->iter_mapsize = abd->abd_size - offset; paddr = ABD_LINEAR_BUF(abd); } else { offset += ABD_SCATTER(abd).abd_offset; paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT]; offset &= PAGE_MASK; aiter->iter_mapsize = MIN(PAGE_SIZE - offset, abd->abd_size - aiter->iter_pos); } aiter->iter_mapaddr = (char *)paddr + offset; } /* * Unmap the current chunk from aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. */ void abd_iter_unmap(struct abd_iter *aiter) { if (!abd_iter_at_end(aiter)) { ASSERT3P(aiter->iter_mapaddr, !=, NULL); ASSERT3U(aiter->iter_mapsize, >, 0); } aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } void abd_cache_reap_now(void) { kmem_cache_reap_soon(abd_chunk_cache); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c index 2cf54a3cef65..38488dbda6f4 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c @@ -1,352 +1,333 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef IDX_TO_OFF #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #endif #if __FreeBSD_version < 1300051 #define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY #else #define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY #endif #if __FreeBSD_version < 1300072 #define dmu_page_lock(m) vm_page_lock(m) #define dmu_page_unlock(m) vm_page_unlock(m) #else #define dmu_page_lock(m) #define dmu_page_unlock(m) #endif -static int -dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - - dnode_rele(dn, FTAG); - - return (err); -} - int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, vm_page_t *ma, dmu_tx_t *tx) { dmu_buf_t **dbp; struct sf_buf *sf; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT3U(size, >, 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(*ma, &sf); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(sf); ma += 1; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, int last_size) { struct sf_buf *sf; vm_object_t vmobj; vm_page_t m; dmu_buf_t **dbp; dmu_buf_t *db; caddr_t va; int numbufs, i; int bufoff, pgoff, tocpy; int mi, di; int err; ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); ASSERT3S(last_size, <=, PAGE_SIZE); err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); if (err != 0) return (err); #ifdef ZFS_DEBUG IMPLY(last_size < PAGE_SIZE, *rahead == 0); if (dbp[0]->db_offset != 0 || numbufs > 1) { for (i = 0; i < numbufs; i++) { ASSERT(ISP2(dbp[i]->db_size)); ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0); ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); } } #endif vmobj = ma[0]->object; zfs_vmobject_wlock_12(vmobj); db = dbp[0]; for (i = 0; i < *rbehind; i++) { m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); if (m == NULL) break; if (!vm_page_none_valid(m)) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(m); break; } ASSERT3U(m->dirty, ==, 0); ASSERT(!pmap_page_is_write_mapped(m)); ASSERT3U(db->db_size, >, PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; va = zfs_map_page(m, &sf); bcopy((char *)db->db_data + bufoff, va, PAGESIZE); zfs_unmap_page(sf); vm_page_valid(m); dmu_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); dmu_page_unlock(m); vm_page_do_sunbusy(m); } *rbehind = i; bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; pgoff = 0; for (mi = 0, di = 0; mi < count && di < numbufs; ) { if (pgoff == 0) { m = ma[mi]; if (m != bogus_page) { vm_page_assert_xbusied(m); ASSERT(vm_page_none_valid(m)); ASSERT3U(m->dirty, ==, 0); ASSERT(!pmap_page_is_write_mapped(m)); va = zfs_map_page(m, &sf); } } if (bufoff == 0) db = dbp[di]; if (m != bogus_page) { ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, db->db_offset + bufoff); } /* * We do not need to clamp the copy size by the file * size as the last block is zero-filled beyond the * end of file anyway. */ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); ASSERT3S(tocpy, >=, 0); if (m != bogus_page) bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); pgoff += tocpy; ASSERT3S(pgoff, >=, 0); ASSERT3S(pgoff, <=, PAGESIZE); if (pgoff == PAGESIZE) { if (m != bogus_page) { zfs_unmap_page(sf); vm_page_valid(m); } ASSERT3S(mi, <, count); mi++; pgoff = 0; } bufoff += tocpy; ASSERT3S(bufoff, >=, 0); ASSERT3S(bufoff, <=, db->db_size); if (bufoff == db->db_size) { ASSERT3S(di, <, numbufs); di++; bufoff = 0; } } #ifdef ZFS_DEBUG /* * Three possibilities: * - last requested page ends at a buffer boundary and , thus, * all pages and buffers have been iterated; * - all requested pages are filled, but the last buffer * has not been exhausted; * the read-ahead is possible only in this case; * - all buffers have been read, but the last page has not been * fully filled; * this is only possible if the file has only a single buffer * with a size that is not a multiple of the page size. */ if (mi == count) { ASSERT3S(di, >=, numbufs - 1); IMPLY(*rahead != 0, di == numbufs - 1); IMPLY(*rahead != 0, bufoff != 0); ASSERT0(pgoff); } if (di == numbufs) { ASSERT3S(mi, >=, count - 1); ASSERT0(*rahead); IMPLY(pgoff == 0, mi == count); if (pgoff != 0) { ASSERT3S(mi, ==, count - 1); ASSERT3U((dbp[0]->db_size & PAGE_MASK), !=, 0); } } #endif if (pgoff != 0) { ASSERT3P(m, !=, bogus_page); bzero(va + pgoff, PAGESIZE - pgoff); zfs_unmap_page(sf); vm_page_valid(m); } for (i = 0; i < *rahead; i++) { m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); if (m == NULL) break; if (!vm_page_none_valid(m)) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(m); break; } ASSERT3U(m->dirty, ==, 0); ASSERT(!pmap_page_is_write_mapped(m)); ASSERT3U(db->db_size, >, PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; tocpy = MIN(db->db_size - bufoff, PAGESIZE); va = zfs_map_page(m, &sf); bcopy((char *)db->db_data + bufoff, va, tocpy); if (tocpy < PAGESIZE) { ASSERT3S(i, ==, *rahead - 1); ASSERT3U((db->db_size & PAGE_MASK), !=, 0); bzero(va + tocpy, PAGESIZE - tocpy); } zfs_unmap_page(sf); vm_page_valid(m); dmu_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); dmu_page_unlock(m); vm_page_do_sunbusy(m); } *rahead = i; zfs_vmobject_wunlock_12(vmobj); dmu_buf_rele_array(dbp, numbufs, FTAG); return (0); } diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 6acd3631348e..f0330150f986 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1,11116 +1,11121 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed and the arc_meta_limit honored. For example, * when using the ZPL each dentry holds a references on a znode. These * dentries must be pruned before the arc buffer holding the znode can * be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ int zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ int arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ int arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_ms; static int arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_min = 0; unsigned long zfs_arc_dnode_limit = 0; unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle. */ unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable make arc_meta_limit adjustable for different workloads. */ unsigned long zfs_arc_meta_limit_percent = 75; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ unsigned long zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux specific */ unsigned long zfs_arc_sys_free = 0; int zfs_arc_min_prefetch_ms = 0; int zfs_arc_min_prescient_prefetch_ms = 0; int zfs_arc_p_dampener_disable = 1; int zfs_arc_meta_prune = 10000; int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; int zfs_arc_meta_adjust_restarts = 4096; int zfs_arc_lotsfree_percent = 10; /* The 6 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ /* max size for dnodes */ #define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ int l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ unsigned long l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ int l2arc_rebuild_enabled = B_TRUE; unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } uint64_t he = atomic_inc_64_nv( &arc_stats.arcstat_hash_elements.value.ui64); ARCSTAT_MAX(arcstat_hash_elements_max, he); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; hdr_full_cons(vbuf, unused, kmflag); bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_full_crypt_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; hdr_full_dest(vbuf, unused); arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr __maybe_unused = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { tmpbuf = zio_buf_alloc(lsize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); ASSERT3U(csize, <=, psize); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_DO_ADAPT); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !ARC_BUF_SHARED(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf, hash_lock); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } state = hdr->b_l1hdr.b_state; if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) zfs_refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) aggsum_add(&arc_sums.arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, space) >= 0); ARCSTAT_MAX(arcstat_meta_max, aggsum_upper_bound(&arc_sums.arcstat_meta_used)); aggsum_add(&arc_sums.arcstat_meta_used, -space); } ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; if (encrypted) hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) { hdr->b_crypt_hdr.b_ebufcnt -= 1; /* * If we have no more encrypted buffers and we've * already gotten a copy of the decrypted data we can * free b_rabd to save some space. */ if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_hdr_free_abd(hdr, B_TRUE); } } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); /* * if the caller wanted a new full header and the header is to be * encrypted we will actually allocate the header from the full crypt * cache instead. The same applies to freeing from the old cache. */ if (HDR_PROTECTED(hdr) && new == hdr_full_cache) new = hdr_full_crypt_cache; if (HDR_PROTECTED(hdr) && old == hdr_full_cache) old = hdr_full_crypt_cache; nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function allows an L1 header to be reallocated as a crypt * header and vice versa. If we are going to a crypt header, the * new fields will be zeroed out. */ static arc_buf_hdr_t * arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) { arc_buf_hdr_t *nhdr; arc_buf_t *buf; kmem_cache_t *ncache, *ocache; /* * This function requires that hdr is in the arc_anon state. * Therefore it won't have any L2ARC data for us to worry * about copying. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); ASSERT3P(hdr->b_hash_next, ==, NULL); if (need_crypt) { ncache = hdr_full_crypt_cache; ocache = hdr_full_cache; } else { ncache = hdr_full_cache; ocache = hdr_full_crypt_cache; } nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); /* * Copy all members that aren't locks or condvars to the new header. * No lists are pointing to us (as we asserted above), so we don't * need to worry about the list nodes. */ nhdr->b_dva = hdr->b_dva; nhdr->b_birth = hdr->b_birth; nhdr->b_type = hdr->b_type; nhdr->b_flags = hdr->b_flags; nhdr->b_psize = hdr->b_psize; nhdr->b_lsize = hdr->b_lsize; nhdr->b_spa = hdr->b_spa; nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; /* * This zfs_refcount_add() exists only to ensure that the individual * arc buffers always point to a header that is referenced, avoiding * a small race condition that could trigger ASSERTs. */ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { mutex_enter(&buf->b_evict_lock); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); } zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); if (need_crypt) { arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); } else { arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); } /* unset all members of the original hdr */ bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_type = ARC_BUFC_INVALID; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; hdr->b_spa = 0; hdr->b_l1hdr.b_freeze_cksum = NULL; hdr->b_l1hdr.b_buf = NULL; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_byteswap = 0; hdr->b_l1hdr.b_state = NULL; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_pabd = NULL; if (ocache == hdr_full_crypt_cache) { ASSERT(!HDR_HAS_RABD(hdr)); hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; hdr->b_crypt_hdr.b_ebufcnt = 0; hdr->b_crypt_hdr.b_dsobj = 0; bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); } buf_discard_identity(hdr); kmem_cache_free(ocache, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); if (!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); if (iv != NULL) bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); if (mac != NULL) bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ - if (HDR_HAS_L2HDR(hdr)) + if (HDR_HAS_L2HDR(hdr)) { + + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + arc_hdr_l2hdr_destroy(hdr); + } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (!HDR_PROTECTED(hdr)) { kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_full_crypt_cache, hdr); } } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime))) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) { bytes_evicted += HDR_GET_LSIZE(hdr); *real_evicted += HDR_GET_LSIZE(hdr); } mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count <= 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, hash_lock, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ cond_resched(); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Try to reduce pinned dnodes with a floor of arc_dnode_limit. * Request that 10% of the LRUs be scanned by the superblock * shrinker. */ if (type == ARC_BUFC_DATA && aggsum_compare( &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { arc_prune_async((aggsum_upper_bound( &arc_sums.arcstat_dnode_size) - arc_dnode_size_limit) / sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); } /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * The goal of this function is to evict enough meta data buffers from the * ARC in order to enforce the arc_meta_limit. Achieving this is slightly * more complicated than it appears because it is common for data buffers * to have holds on meta data buffers. In addition, dnode meta data buffers * will be held by the dnodes in the block preventing them from being freed. * This means we can't simply traverse the ARC and expect to always find * enough unheld meta data buffer to release. * * Therefore, this function has been updated to make alternating passes * over the ARC releasing data buffers and then newly unheld meta data * buffers. This ensures forward progress is maintained and meta_used * will decrease. Normally this is sufficient, but if required the ARC * will call the registered prune callbacks causing dentry and inodes to * be dropped from the VFS cache. This will make dnode meta data buffers * available for reclaim. */ static uint64_t arc_evict_meta_balanced(uint64_t meta_used) { int64_t delta, prune = 0, adjustmnt; uint64_t total_evicted = 0; arc_buf_contents_t type = ARC_BUFC_DATA; int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); restart: /* * This slightly differs than the way we evict from the mru in * arc_evict because we don't have a "target" value (i.e. no * "meta" arc_p). As a result, I think we can completely * cannibalize the metadata in the MRU before we evict the * metadata from the MFU. I think we probably need to implement a * "metadata arc_p" value to do this properly. */ adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } /* * We can't afford to recalculate adjustmnt here. If we do, * new metadata buffers can sneak into the MRU or ANON lists, * thus penalize the MFU metadata. Although the fudge factor is * small, it has been empirically shown to be significant for * certain workloads (e.g. creating many empty directories). As * such, we use the original calculation for adjustmnt, and * simply decrement the amount of data evicted from the MRU. */ if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); } adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); } /* * If after attempting to make the requested adjustment to the ARC * the meta limit is still being exceeded then request that the * higher layers drop some cached objects which have holds on ARC * meta buffers. Requests to the upper layers will be made with * increasingly large scan sizes until the ARC is below the limit. */ if (meta_used > arc_meta_limit) { if (type == ARC_BUFC_DATA) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; if (zfs_arc_meta_prune) { prune += zfs_arc_meta_prune; arc_prune_async(prune); } } if (restarts > 0) { restarts--; goto restart; } } return (total_evicted); } /* * Evict metadata buffers from the cache, such that arcstat_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_evict_meta_only(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static uint64_t arc_evict_meta(uint64_t meta_used) { if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) return (arc_evict_meta_only(meta_used)); else return (arc_evict_meta_balanced(meta_used)); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_evict_type(arc_state_t *state) { multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; uint64_t asize = aggsum_value(&arc_sums.arcstat_size); uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_evict_meta(ameta); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(asize - arc_c), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Re-sum ARC stats after the first round of evictions. */ asize = aggsum_value(&arc_sums.arcstat_size); ameta = aggsum_value(&arc_sums.arcstat_meta_used); /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. */ target = asize - arc_c; if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = zfs_refcount_count(&arc_mru->arcs_size) + zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_sums.arcstat_size); /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t c = MIN(arc_c, asize); if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } else { arc_c = arc_c_min; } if (asize > arc_c) { /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL if ((aggsum_compare(&arc_sums.arcstat_meta_used, arc_meta_limit) >= 0) && zfs_arc_meta_prune) { /* * We are exceeding our meta-data cache limit. * Prune some entries to release holds on meta-data. */ arc_prune_async(zfs_arc_meta_prune); } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } /* ARGSUSED */ static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ return (arc_evict_needed); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ /* ARGSUSED */ static void arc_evict_cb(void *arg, zthr_t *zthr) { uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Evict from cache */ evicted = arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } /* ARGSUSED */ static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ /* ARGSUSED */ static void arc_reap_cb(void *arg, zthr_t *zthr) { int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); if (aggsum_upper_bound(&arc_sums.arcstat_size) >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static arc_ovf_level_t arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; if (!use_reserve) overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { ASSERT(type == ARC_BUFC_DATA); return (abd_alloc(size, B_FALSE)); } } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. */ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, int alloc_flags) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); if (alloc_flags & ARC_HDR_DO_ADAPT) arc_adapt(size, state); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(&buf->b_evict_lock); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { if (buf == NULL) return; bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ /* ARGSUSED */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: /* * Verify the block pointer contents are reasonable. This should * always be the case since the blkptr is protected by a checksum. * However, if there is damage it's desirable to detect this early * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto out; } if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto out; } ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done && !no_buf) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { ARCSTAT_BUMP( arcstat_demand_hit_prescient_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); } ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { (void) remove_reference(hdr, hash_lock, private); arc_buf_destroy_impl(buf); buf = NULL; } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); goto out; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } /* * This is a delicate dance that we play here. * This hdr might be in the ghost list so we access * it to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); } arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has a reference (i.e. a dedup-ed, * dmu_sync-ed block). If this block is being prefetched, then it * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr * until the I/O completes. A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size, arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) hdr->b_crypt_hdr.b_ebufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; if (ARC_BUF_ENCRYPTED(buf)) nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) zfs_refcount_add_many(&arc_anon->arcs_size, arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!abd_size_alloc_linear(arc_buf_size(buf)) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, ZIO_DATA_SALT_LEN); bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, ZIO_DATA_IV_LEN); bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = zfs_refcount_count(&state->arcs_size); evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); as->arcstat_dnode_size.value.ui64 = aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = aggsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); arc_p = (arc_c >> 1); if (arc_meta_limit > arc_c_max) arc_meta_limit = arc_c_max; if (arc_dnode_size_limit > arc_meta_limit) arc_dnode_size_limit = arc_meta_limit; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 16M - */ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_meta_min <= arc_c_max)) { arc_meta_min = zfs_arc_meta_min; if (arc_meta_limit < arc_meta_min) arc_meta_limit = arc_meta_min; if (arc_dnode_size_limit < arc_meta_min) arc_dnode_size_limit = arc_meta_min; } WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); /* Valid range: - */ limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; if ((limit != arc_meta_limit) && (limit >= arc_meta_min) && (limit <= arc_c_max)) arc_meta_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); /* Valid range: - */ limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; if ((limit != arc_dnode_size_limit) && (limit >= arc_meta_min) && (limit <= arc_meta_limit)) arc_dnode_size_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N */ if (zfs_arc_p_min_shift) arc_p_min_shift = zfs_arc_p_min_shift; /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if ((zfs_arc_lotsfree_percent >= 0) && (zfs_arc_lotsfree_percent <= 100)) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_init(void) { multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_l2c_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_l2c_multilist_index_func); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size); zfs_refcount_create(&arc_mru->arcs_size); zfs_refcount_create(&arc_mru_ghost->arcs_size); zfs_refcount_create(&arc_mfu->arcs_size); zfs_refcount_create(&arc_mfu_ghost->arcs_size); zfs_refcount_create(&arc_l2c_only->arcs_size); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); aggsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size); zfs_refcount_destroy(&arc_mru->arcs_size); zfs_refcount_destroy(&arc_mru_ghost->arcs_size); zfs_refcount_destroy(&arc_mfu->arcs_size); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); zfs_refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); aggsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; arc_p = (arc_c >> 1); /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* * Set arc_meta_limit to a percent of arc_c_max with a floor of * arc_meta_min, and a ceiling of arc_c_max. */ percent = MIN(zfs_arc_meta_limit_percent, 100); arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_size_limit = (percent * arc_meta_limit) / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_evict_zthr = zthr_create("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. */ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_head(&arc_prune_list)) != NULL) { list_remove(&arc_prune_list, p); zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; tsize = size + l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (arc_warm == B_FALSE) size += l2arc_write_boost; } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { bzero(l2dhdr, dev->l2ad_dev_hdr_asize); } else { bzero(&l2dhdr->dh_start_lbps[i], sizeof (l2arc_log_blkptr_t)); } break; } bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; /* * We need to add in the worst case scenario of log block overhead. */ distance += l2arc_log_blk_overhead(distance, dev); if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the write size, whichever is greater. */ distance += MAX(64 * 1024 * 1024, (distance * l2arc_trim_ahead) / 100); } top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { cabd = abd_alloc_for_io(asize, ismd); tmp = abd_borrow_buf(cabd, asize); psize = zio_compress_data(compress, to_write, tmp, size, hdr->b_complevel); if (psize >= size) { abd_return_buf(cabd, tmp, asize); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) bzero((char *)tmp + psize, asize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, asize); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * If pass == 1 or 3, we cache MRU metadata and data * respectively. */ if (l2arc_mfuonly) { if (pass == 1 || pass == 3) continue; } multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); if ((write_asize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3U(arc_hdr_size(hdr), >, 0); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) l2arc_log_blk_commit(dev, pio, cb); zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ /* ARGSUSED */ static void l2arc_feed_thread(void *unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { bzero(l2dhdr, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. * When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); bcopy(&lbps[0], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; cond_resched(); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); bzero(l2dhdr, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ bzero(tmpbuf + psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ bcopy(lb, tmpbuf, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; bzero(le, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, param_get_long, ZMOD_RW, "Min arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, param_get_long, ZMOD_RW, "Max arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, param_get_long, ZMOD_RW, "Metadata limit for arc size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, "Percent of arc size for arc meta limit"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, param_get_long, ZMOD_RW, "Min arc metadata"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, "Meta objects to scan for prune"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, "Limit number of restarts in arc_evict_meta"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, "Meta reclaim strategy"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_int, ZMOD_RW, "Seconds before growing arc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, "Disable arc_p adapt dampener"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim arc to"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed arc buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, param_get_long, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 1c47430953b1..b29d82fd793e 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -1,2359 +1,2357 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ unsigned long zfs_per_txg_dirty_frees_percent = 5; /* * Enable/disable forcing txg sync when dirty in dmu_offset_next. */ int zfs_dmu_offset_next_sync = 0; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; static int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio = NULL; boolean_t missed = B_FALSE; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (read) zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn), B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) zio_nowait(zio); return (SET_ERROR(EIO)); } /* * Initiate async demand data read. * We check the db_state after calling dbuf_read() because * (1) dbuf_read() may change the state to CACHED due to a * hit in the ARC, and (2) on a cache miss, a child will * have been added to "zio" but not yet completed, so the * state will not yet be CACHED. */ if (read) { (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; } dbp[i] = &db->db; } if (!read) zfs_racct_write(length, nblks); if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { /* wait for async read i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } -static int +int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * See comment before the definition of dmu_prefetch_max. */ len = MIN(len, dmu_prefetch_max); /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. * Used below in dmu_free_long_range_impl() to enable abort when unmounting */ /*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { - if (err == 0) - err = dmu_object_free(os, object, tx); - + err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * Note: Lustre is an external consumer of this interface. */ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset zfs_uio_offset(uio). */ int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX zfs_uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that zfs_uiomove won't * block. */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset zfs_uio_offset(uio). */ int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * A "lightweight" write is faster than a regular write (e.g. * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the * data can not be read or overwritten until the transaction's txg has been * synced. This makes it appropriate for workloads that are known to be * (temporarily) write-only, like "zfs receive". * * A single block is written, starting at the specified offset in bytes. If * the call is successful, it returns 0 and the provided abd has been * consumed (the caller should not free it). */ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); if (dr == NULL) return (SET_ERROR(EIO)); dr->dt.dll.dr_abd = abd; dr->dt.dll.dr_props = *zp; dr->dt.dll.dr_flags = flags; return (0); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); if (db == NULL) return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { zfs_racct_write(blksz, 1); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); return (err); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. * * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. */ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); bzero(zp->zp_iv, ZIO_DATA_IV_LEN); bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * This function is only called from zfs_holey_common() for zpl_llseek() * in order to determine the location of holes. In order to accurately * report holes all dirty data must be synced to disk. This causes extremely * poor performance when seeking for holes in a dirty file. As a compromise, * only provide hole data when the dnode is clean. When a dnode is dirty * report the dnode as having no holes which is always a safe thing to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; boolean_t clean = B_TRUE; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Check if dnode is dirty */ for (i = 0; i < TXG_SIZE; i++) { if (multilist_link_active(&dn->dn_dirty_link[i])) { clean = B_FALSE; break; } } /* * If compatibility option is on, sync any current changes before * we go trundling through the block pointers. */ if (!clean && zfs_dmu_offset_next_sync) { clean = B_TRUE; dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } if (clean) err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); else err = SET_ERROR(EBUSY); dnode_rele(dn, FTAG); return (err); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, "Limit one prefetch call to this size"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index a26b0d739921..043344a1375f 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -1,550 +1,551 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * This tunable disables predictive prefetch. Note that it leaves "prescient" * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, * prescient prefetch never issues i/os that end up not being needed, * so it can't hurt performance. */ int zfs_prefetch_disable = B_FALSE; /* max # of streams per zfetch */ unsigned int zfetch_max_streams = 8; /* min time before stream reclaim */ unsigned int zfetch_min_sec_reap = 2; /* max bytes to prefetch per stream (default 8MB) */ unsigned int zfetch_max_distance = 8 * 1024 * 1024; /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; /* max number of bytes in an array_read in which we allow prefetching (1MB) */ unsigned long zfetch_array_rd_sz = 1024 * 1024; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, { "io_issued", KSTAT_DATA_UINT64 }, }; struct { wmsum_t zfetchstat_hits; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; } zfetch_sums; #define ZFETCHSTAT_BUMP(stat) \ wmsum_add(&zfetch_sums.stat, 1) #define ZFETCHSTAT_ADD(stat, val) \ wmsum_add(&zfetch_sums.stat, val) kstat_t *zfetch_ksp; static int zfetch_kstats_update(kstat_t *ksp, int rw) { zfetch_stats_t *zs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); zs->zfetchstat_hits.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_hits); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_max_streams); zs->zfetchstat_io_issued.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_io_issued); return (0); } void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zfetch_ksp != NULL) { zfetch_ksp->ks_data = &zfetch_stats; zfetch_ksp->ks_update = zfetch_kstats_update; kstat_install(zfetch_ksp); } } void zfetch_fini(void) { if (zfetch_ksp != NULL) { kstat_delete(zfetch_ksp); zfetch_ksp = NULL; } wmsum_fini(&zfetch_sums.zfetchstat_hits); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); } /* * This takes a pointer to a zfetch structure and a dnode. It performs the * necessary setup for the zfetch structure, grokking data from the * associated dnode. */ void dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { if (zf == NULL) return; zf->zf_dnode = dno; zf->zf_numstreams = 0; list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zs_node)); mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); } static void dmu_zfetch_stream_fini(zstream_t *zs) { ASSERT(!list_link_active(&zs->zs_node)); zfs_refcount_destroy(&zs->zs_callers); zfs_refcount_destroy(&zs->zs_refs); kmem_free(zs, sizeof (*zs)); } static void dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { ASSERT(MUTEX_HELD(&zf->zf_lock)); list_remove(&zf->zf_stream, zs); zf->zf_numstreams--; membar_producer(); if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); } /* * Clean-up state associated with a zfetch structure (e.g. destroy the * streams). This doesn't free the zfetch_t itself, that's left to the caller. */ void dmu_zfetch_fini(zfetch_t *zf) { zstream_t *zs; mutex_enter(&zf->zf_lock); while ((zs = list_head(&zf->zf_stream)) != NULL) dmu_zfetch_stream_remove(zf, zs); mutex_exit(&zf->zf_lock); list_destroy(&zf->zf_stream); mutex_destroy(&zf->zf_lock); zf->zf_dnode = NULL; } /* * If there aren't too many streams already, create a new stream. * The "blkid" argument is the next block that we expect this stream to access. * While we're here, clean up old streams (which haven't been * accessed for at least zfetch_min_sec_reap seconds). */ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs_next; hrtime_t now = gethrtime(); ASSERT(MUTEX_HELD(&zf->zf_lock)); /* * Clean up old streams. */ for (zstream_t *zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. */ if (zfs_refcount_count(&zs->zs_refs) != 1) continue; if (((now - zs->zs_atime) / NANOSEC) > zfetch_min_sec_reap) dmu_zfetch_stream_remove(zf, zs); } /* * The maximum number of streams is normally zfetch_max_streams, * but for small files we lower it such that it's at least possible * for all the streams to be non-overlapping. * * If we are already at the maximum number of streams for this file, * even after removing old streams, then don't create this stream. */ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / zfetch_max_distance)); if (zf->zf_numstreams >= max_streams) { ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zs->zs_blkid = blkid; zs->zs_pf_blkid1 = blkid; zs->zs_pf_blkid = blkid; zs->zs_ipf_blkid1 = blkid; zs->zs_ipf_blkid = blkid; zs->zs_atime = now; zs->zs_fetch = zf; zs->zs_missed = B_FALSE; zfs_refcount_create(&zs->zs_callers); zfs_refcount_create(&zs->zs_refs); /* One reference for zf_stream. */ zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; list_insert_head(&zf->zf_stream, zs); } static void dmu_zfetch_stream_done(void *arg, boolean_t io_issued) { zstream_t *zs = arg; if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); } /* * This is the predictive prefetch entry point. dmu_zfetch_prepare() * associates dnode access specified with blkid and nblks arguments with * prefetch stream, predicts further accesses based on that stats and returns * the stream pointer on success. That pointer must later be passed to * dmu_zfetch_run() to initiate the speculative prefetch for the stream and * release it. dmu_zfetch() is a wrapper for simple cases when window between * prediction and prefetch initiation is not needed. * fetch_data argument specifies whether actual data blocks should be fetched: * FALSE -- prefetch only indirect blocks for predicted data blocks; * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ zstream_t * dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, boolean_t have_lock) { zstream_t *zs; int64_t pf_start, ipf_start; int64_t pf_ahead_blks, max_blks; int max_dist_blks, pf_nblks, ipf_nblks; uint64_t end_of_access_blkid, maxblkid; end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; if (zfs_prefetch_disable) return (NULL); /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on * concrete vdevs (or previously-loaded indirect vdevs). So we * can't allow the predictive prefetcher to attempt reads of other * blocks (e.g. of the MOS's dnode object). */ if (!spa_indirect_vdevs_loaded(spa)) return (NULL); /* * As a fast path for small (single-block) files, ignore access * to the first block. */ if (!have_lock && blkid == 0) return (NULL); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); /* * A fast path for small files for which no prefetch will * happen. */ maxblkid = zf->zf_dnode->dn_maxblkid; if (maxblkid < 2) { if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); return (NULL); } mutex_enter(&zf->zf_lock); /* * Find matching prefetch stream. Depending on whether the accesses * are block-aligned, first block of the new access may either follow * the last block of the previous access, or be equal to it. */ for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid == zs->zs_blkid) { break; } else if (blkid + 1 == zs->zs_blkid) { blkid++; nblks--; break; } } /* * If the file is ending, remove the matching stream if found. * If not found then it is too late to create a new one now. */ if (end_of_access_blkid >= maxblkid) { if (zs != NULL) dmu_zfetch_stream_remove(zf, zs); mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); return (NULL); } /* Exit if we already prefetched this block before. */ if (nblks == 0) { mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); return (NULL); } if (zs == NULL) { /* * This access is not part of any existing stream. Create * a new stream for it. */ dmu_zfetch_stream_create(zf, end_of_access_blkid); mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_misses); return (NULL); } /* * This access was to a block that we issued a prefetch for on * behalf of this stream. Issue further prefetches for this stream. * * Normally, we start prefetching where we stopped * prefetching last (zs_pf_blkid). But when we get our first * hit on this stream, zs_pf_blkid == zs_blkid, we don't * want to prefetch the block we just accessed. In this case, * start just after the block we just accessed. */ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); if (zs->zs_pf_blkid1 < end_of_access_blkid) zs->zs_pf_blkid1 = end_of_access_blkid; if (zs->zs_ipf_blkid1 < end_of_access_blkid) zs->zs_ipf_blkid1 = end_of_access_blkid; /* * Double our amount of prefetched data, but don't let the * prefetch get further ahead than zfetch_max_distance. */ if (fetch_data) { max_dist_blks = zfetch_max_distance >> zf->zf_dnode->dn_datablkshift; /* * Previously, we were (zs_pf_blkid - blkid) ahead. We * want to now be double that, so read that amount again, * plus the amount we are catching up by (i.e. the amount * read just now). */ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks; max_blks = max_dist_blks - (pf_start - end_of_access_blkid); pf_nblks = MIN(pf_ahead_blks, max_blks); } else { pf_nblks = 0; } zs->zs_pf_blkid = pf_start + pf_nblks; /* * Do the same for indirects, starting from where we stopped last, * or where we will stop reading data blocks (and the indirects * that point to them). */ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid); max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift; /* * We want to double our distance ahead of the data prefetch * (or reader, if we are not prefetching data). Previously, we * were (zs_ipf_blkid - blkid) ahead. To double that, we read * that amount again, plus the amount we are catching up by * (i.e. the amount read now + the amount of data prefetched now). */ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid); ipf_nblks = MIN(pf_ahead_blks, max_blks); zs->zs_ipf_blkid = ipf_start + ipf_nblks; zs->zs_blkid = end_of_access_blkid; /* Protect the stream from reclamation. */ zs->zs_atime = gethrtime(); zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. */ zfs_refcount_add(&zs->zs_callers, NULL); mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_hits); return (zs); } void dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) { zfetch_t *zf = zs->zs_fetch; int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; if (missed) zs->zs_missed = missed; /* * Postpone the prefetch if there are more concurrent callers. * It happens when multiple requests are waiting for the same * indirect block. The last one will run the prefetch for all. */ if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { /* Drop reference taken in dmu_zfetch_prepare(). */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); return; } mutex_enter(&zf->zf_lock); if (zs->zs_missed) { pf_start = zs->zs_pf_blkid1; pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid; } else { pf_start = pf_end = 0; } ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1); ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid; mutex_exit(&zf->zf_lock); ASSERT3S(pf_start, <=, pf_end); ASSERT3S(ipf_start, <=, ipf_end); epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; ASSERT3S(ipf_start, <=, ipf_end); issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). */ - zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL); + for (int i = 0; i < issued - 1; i++) + zfs_refcount_add(&zs->zs_refs, NULL); } else if (issued == 0) { /* Some other thread has done our work, so drop the ref. */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); return; } if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); issued = 0; for (int64_t blk = pf_start; blk < pf_end; blk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, dmu_zfetch_stream_done, zs); } for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, dmu_zfetch_stream_done, zs); } if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); if (issued) ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); } void dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, boolean_t missed, boolean_t have_lock) { zstream_t *zs; zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); if (zs) dmu_zfetch_run(zs, missed, have_lock); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, "Disable all ZFS prefetching"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, "Max number of streams per zfetch"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, "Min time before stream reclaim"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, "Max bytes to prefetch per stream"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW, "Number of bytes in a array_read"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 7f741542ce02..900240479c70 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -1,2580 +1,2571 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include dnode_stats_t dnode_stats = { { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, { "dnode_allocate", KSTAT_DATA_UINT64 }, { "dnode_reallocate", KSTAT_DATA_UINT64 }, { "dnode_buf_evict", KSTAT_DATA_UINT64 }, { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, { "dnode_alloc_race", KSTAT_DATA_UINT64 }, { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, { "dnode_move_invalid", KSTAT_DATA_UINT64 }, { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, { "dnode_move_special", KSTAT_DATA_UINT64 }, { "dnode_move_handle", KSTAT_DATA_UINT64 }, { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, { "dnode_move_active", KSTAT_DATA_UINT64 }, }; static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; static dnode_phys_t dnode_phys_zero __maybe_unused; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; #ifdef _KERNEL static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ static int dbuf_compare(const void *x1, const void *x2) { const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; int cmp = TREE_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); cmp = TREE_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); } else if (d2->db_state == DB_SEARCH) { ASSERT3S(d1->db_state, !=, DB_SEARCH); return (1); } return (TREE_PCMP(d1, d2)); } /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { dnode_t *dn = arg; int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is * O(number of references), so don't track dn_holds. */ zfs_refcount_create_untracked(&dn->dn_holds); zfs_refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); bzero(&dn->dn_next_type[0], sizeof (dn->dn_next_type)); bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid)); for (i = 0; i < TXG_SIZE; i++) { multilist_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_oldprojid = ZFS_DEFAULT_PROJID; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_newprojid = ZFS_DEFAULT_PROJID; dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; return (0); } /* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { int i; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); cv_destroy(&dn->dn_nodnholds); zfs_refcount_destroy(&dn->dn_holds); zfs_refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT0(dn->dn_next_maxblkid[i]); } ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); ASSERT0(dn->dn_dirty_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); ASSERT(!dn->dn_have_spill); ASSERT3P(dn->dn_zio, ==, NULL); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); ASSERT0(dn->dn_oldgid); ASSERT0(dn->dn_oldprojid); ASSERT0(dn->dn_newuid); ASSERT0(dn->dn_newgid); ASSERT0(dn->dn_newprojid); ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); avl_destroy(&dn->dn_dbufs); } void dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dnode_ksp != NULL) { dnode_ksp->ks_data = &dnode_stats; kstat_install(dnode_ksp); } } void dnode_fini(void) { if (dnode_ksp != NULL) { kstat_delete(dnode_ksp); dnode_ksp = NULL; } kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } #ifdef ZFS_DEBUG void dnode_verify(dnode_t *dn) { int drop_struct_lock = FALSE; ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + dn->dn_bonuslen, <=, max_bonuslen); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } #endif void dnode_byteswap(dnode_phys_t *dnp) { uint64_t *buf64 = (void*)&dnp->dn_blkptr; int i; if (dnp->dn_type == DMU_OT_NONE) { bzero(dnp, sizeof (dnode_phys_t)); return; } dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); /* * dn_nblkptr is only one byte, so it's OK to read it in either * byte order. We can't read dn_bouslen. */ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) buf64[i] = BSWAP_64(buf64[i]); /* * OK to check dn_bonuslen for zero, because it won't matter if * we have the wrong byte order. This is necessary because the * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { /* * Note that the bonus length calculated here may be * longer than the actual bonus buffer. This is because * we always put the bonus buffer after the last block * pointer (instead of packing it against the end of the * dnode buffer). */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); int slots = dnp->dn_extra_slots + 1; size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; dmu_object_byteswap_t byteswap; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { int i = 0; ASSERT3U(sizeof (dnode_phys_t), ==, (1<dn_type != DMU_OT_NONE) i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); if (newsize < dn->dn_bonuslen) { /* clear any data after the end of the new size */ size_t diff = dn->dn_bonuslen - newsize; char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize; bzero(data_end, diff); } dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; else dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; rw_exit(&dn->dn_struct_rwlock); } void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) { ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dn->dn_bonustype = newtype; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; rw_exit(&dn->dn_struct_rwlock); } void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); dnode_setdirty(dn, tx); dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK; dn->dn_have_spill = B_FALSE; } static void dnode_setdblksz(dnode_t *dn, int size) { ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate * for the dnode_move() callback. */ dn->dn_object = object; dn->dn_dbuf = db; dn->dn_handle = dnh; dn->dn_phys = dnp; if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } else { dn->dn_datablksz = 0; dn->dn_datablkszsec = 0; dn->dn_datablkshift = 0; } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; dn->dn_nblkptr = dnp->dn_nblkptr; dn->dn_checksum = dnp->dn_checksum; dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); mutex_enter(&os->os_lock); /* * Exclude special dnodes from os_dnodes so an empty os_dnodes * signifies that the special dnodes have no references from * their children (the entries in os_dnodes). This allows * dnode_destroy() to easily determine if the last child has * been removed and then complete eviction of the objset. */ if (!DMU_OBJECT_IS_SPECIAL(object)) list_insert_head(&os->os_dnodes, dn); membar_producer(); /* * Everything else must be valid before assigning dn_objset * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); return (dn); } /* * Caller must be holding the dnode handle, which is released upon return. */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { list_remove(&os->os_dnodes, dn); complete_os_eviction = list_is_empty(&os->os_dnodes) && list_link_active(&os->os_evicting_node); } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirty_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; dn->dn_have_spill = B_FALSE; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_oldprojid = ZFS_DEFAULT_PROJID; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_newprojid = ZFS_DEFAULT_PROJID; dn->dn_id_flags = 0; dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); if (complete_os_eviction) dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int i; ASSERT3U(dn_slots, >, 0); ASSERT3U(dn_slots << DNODE_SHIFT, <=, spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); if (ibs == 0) ibs = zfs_default_ibs; ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", dn->dn_objset, (u_longlong_t)dn->dn_object, (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots); DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT0(dn->dn_next_maxblkid[i]); ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; dn->dn_num_slots = dn_slots; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; else { dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> SPA_BLKPTRSHIFT)); } dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_dirty_txg = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dn_slots, boolean_t keep_spill, dmu_tx_t *tx) { int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT)); dnode_free_interior_slots(dn); DNODE_STAT_BUMP(dnode_reallocate); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { /* change blocksize */ ASSERT0(dn->dn_maxblkid); ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || dnode_block_freed(dn, 0)); dnode_setdblksz(dn, blocksize); dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize; } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else nblkptr = MIN(DN_MAX_NBLKPTR, 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> SPA_BLKPTRSHIFT)); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr; if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_num_slots = dn_slots; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); } #ifdef _KERNEL static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { int i; ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); /* Copy fields. */ ndn->dn_objset = odn->dn_objset; ndn->dn_object = odn->dn_object; ndn->dn_dbuf = odn->dn_dbuf; ndn->dn_handle = odn->dn_handle; ndn->dn_phys = odn->dn_phys; ndn->dn_type = odn->dn_type; ndn->dn_bonuslen = odn->dn_bonuslen; ndn->dn_bonustype = odn->dn_bonustype; ndn->dn_nblkptr = odn->dn_nblkptr; ndn->dn_checksum = odn->dn_checksum; ndn->dn_compress = odn->dn_compress; ndn->dn_nlevels = odn->dn_nlevels; ndn->dn_indblkshift = odn->dn_indblkshift; ndn->dn_datablkshift = odn->dn_datablkshift; ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; ndn->dn_num_slots = odn->dn_num_slots; bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], sizeof (odn->dn_next_type)); bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], sizeof (odn->dn_next_nlevels)); bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], sizeof (odn->dn_next_indblkshift)); bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], sizeof (odn->dn_next_bonustype)); bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], sizeof (odn->dn_rm_spillblk)); bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0], sizeof (odn->dn_next_maxblkid)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], sizeof (odn->dn_free_ranges)); ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; ndn->dn_dirty_txg = odn->dn_dirty_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_bonus = odn->dn_bonus; ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; ndn->dn_oldused = odn->dn_oldused; ndn->dn_oldflags = odn->dn_oldflags; ndn->dn_olduid = odn->dn_olduid; ndn->dn_oldgid = odn->dn_oldgid; ndn->dn_oldprojid = odn->dn_oldprojid; ndn->dn_newuid = odn->dn_newuid; ndn->dn_newgid = odn->dn_newgid; ndn->dn_newprojid = odn->dn_newprojid; ndn->dn_id_flags = odn->dn_id_flags; dmu_zfetch_init(&ndn->dn_zfetch, ndn); /* * Update back pointers. Updating the handle fixes the back pointer of * every descendant dbuf as well as the bonus dbuf. */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; /* * Invalidate the original dnode by clearing all of its back pointers. */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_bonus = NULL; dmu_zfetch_fini(&odn->dn_zfetch); /* * Set the low bit of the objset pointer to ensure that dnode_move() * recognizes the dnode as invalid in any subsequent callback. */ POINTER_INVALIDATE(&odn->dn_objset); /* * Satisfy the destructor. */ for (i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); odn->dn_free_ranges[i] = NULL; odn->dn_next_nlevels[i] = 0; odn->dn_next_indblkshift[i] = 0; odn->dn_next_bonustype[i] = 0; odn->dn_rm_spillblk[i] = 0; odn->dn_next_bonuslen[i] = 0; odn->dn_next_blksz[i] = 0; } odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; odn->dn_dirty_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; odn->dn_oldflags = 0; odn->dn_olduid = 0; odn->dn_oldgid = 0; odn->dn_oldprojid = ZFS_DEFAULT_PROJID; odn->dn_newuid = 0; odn->dn_newgid = 0; odn->dn_newprojid = ZFS_DEFAULT_PROJID; odn->dn_id_flags = 0; /* * Mark the dnode. */ ndn->dn_moved = 1; odn->dn_moved = (uint8_t)-1; } /*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { dnode_t *odn = buf, *ndn = newbuf; objset_t *os; int64_t refcount; uint32_t dbufs; /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when * freeing the dnode to invalidate it, and the memory patterns written * by kmem (baddcafe and deadbeef) set at least one of the two low bits. * A newly created dnode sets the objset pointer last of all to indicate * that the dnode is known and in a valid state to be moved by this * function. */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { DNODE_STAT_BUMP(dnode_move_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Ensure that the objset does not go away during the move. */ rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); DNODE_STAT_BUMP(dnode_move_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the dnode is still valid, then so is the objset. We know that no * valid objset can be freed while we hold os_lock, so we can safely * ensure that the objset remains in use. */ mutex_enter(&os->os_lock); /* * Recheck the objset pointer in case the dnode was removed just before * acquiring the lock. */ if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); DNODE_STAT_BUMP(dnode_move_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold os->os_lock, the dnode * cannot be freed and fields within the dnode can be safely accessed. * The objset listing this dnode cannot go away as long as this dnode is * on its list. */ rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ /* * Lock the dnode handle to prevent the dnode from obtaining any new * holds. This also prevents the descendant dbufs and the bonus dbuf * from accessing the dnode, so that we can discount their holds. The * handle is safe to access because we know that while the dnode cannot * go away, neither can its handle. Once we hold dnh_zrlock, we can * safely move any dnode referenced only by dbufs. */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_handle); return (KMEM_CBRC_LATER); } /* * Ensure a consistent view of the dnode's holds and the dnode's dbufs. * We need to guarantee that there is a hold for every dbuf in order to * determine whether the dnode is actively referenced. Falsely matching * a dbuf to an active hold would lead to an unsafe move. It's possible * that a thread already having an active dnode hold is about to add a * dbuf, and we can't compare hold and dbuf counts while the add is in * progress. */ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_rwlock); return (KMEM_CBRC_LATER); } /* * A dbuf may be removed (evicted) without an active dnode hold. In that * case, the dbuf count is decremented under the handle lock before the * dbuf's hold is released. This order ensures that if we count the hold * after the dbuf is removed but before its hold is released, we will * treat the unmatched hold as active and exit safely. If we count the * hold before the dbuf is removed, the hold is discounted, and the * removal is blocked until the move completes. */ refcount = zfs_refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); dbufs = DN_DBUFS_COUNT(odn); /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, uint32_t, dbufs); if (refcount > dbufs) { rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_BUMP(dnode_move_active); return (KMEM_CBRC_LATER); } rw_exit(&odn->dn_struct_rwlock); /* * At this point we know that anyone with a hold on the dnode is not * actively referencing it. The dnode is known and in a valid state to * move. We're holding the locks needed to execute the critical section. */ dnode_move_impl(odn, ndn); list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. */ ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds)); ASSERT(dbufs == DN_DBUFS_COUNT(ndn)); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); return (KMEM_CBRC_YES); } #endif /* _KERNEL */ static void dnode_slots_hold(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; zrl_add(&dnh->dnh_zrlock); } } static void dnode_slots_rele(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; if (zrl_is_locked(&dnh->dnh_zrlock)) zrl_exit(&dnh->dnh_zrlock); else zrl_remove(&dnh->dnh_zrlock); } } static int dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; if (!zrl_tryenter(&dnh->dnh_zrlock)) { for (int j = idx; j < i; j++) { dnh = &children->dnc_children[j]; zrl_exit(&dnh->dnh_zrlock); } return (0); } } return (1); } static void dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; dnh->dnh_dnode = ptr; } } static boolean_t dnode_check_slots_free(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); /* * If all dnode slots are either already free or * evictable return B_TRUE. */ for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; dnode_t *dn = dnh->dnh_dnode; if (dn == DN_SLOT_FREE) { continue; } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && zfs_refcount_is_zero(&dn->dn_holds) && !DNODE_IS_DIRTY(dn)); mutex_exit(&dn->dn_mtx); if (!can_free) return (B_FALSE); else continue; } else { return (B_FALSE); } } return (B_TRUE); } static void dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { dnode_handle_t *dnh = &children->dnc_children[i]; ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; } } } void dnode_free_interior_slots(dnode_t *dn) { dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; int idx = (dn->dn_object & (epb - 1)) + 1; int slots = dn->dn_num_slots - 1; if (slots == 0) return; ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); cond_resched(); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); dnode_slots_rele(children, idx, slots); } void dnode_special_close(dnode_handle_t *dnh) { dnode_t *dn = dnh->dnh_dnode; /* * Ensure dnode_rele_and_unlock() has released dn_mtx, after final * zfs_refcount_remove() */ mutex_enter(&dn->dn_mtx); if (zfs_refcount_count(&dn->dn_holds) > 0) cv_wait(&dn->dn_nodnholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0); ASSERT(dn->dn_dbuf == NULL || dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; zrl_init(&dnh->dnh_zrlock); VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock)); dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); zrl_exit(&dnh->dnh_zrlock); } static void dnode_buf_evict_async(void *dbu) { dnode_children_t *dnc = dbu; DNODE_STAT_BUMP(dnode_buf_evict); for (int i = 0; i < dnc->dnc_count; i++) { dnode_handle_t *dnh = &dnc->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to * another valid address, so there is no need here to guard * against changes to or from NULL. */ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = DN_SLOT_UNINIT; continue; } zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(zfs_refcount_is_zero(&dn->dn_holds)); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); dnode_destroy(dn); /* implicit zrl_remove() for first slot */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = DN_SLOT_UNINIT; } kmem_free(dnc, sizeof (dnode_children_t) + dnc->dnc_count * sizeof (dnode_handle_t)); } /* * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used * to ensure the hole at the specified object offset is large enough to * hold the dnode being created. The slots parameter is also used to ensure * a dnode does not span multiple dnode blocks. In both of these cases, if * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases * are only possible when using DNODE_MUST_BE_FREE. * * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. * dnode_hold_impl() will check if the requested dnode is already consumed * as an extra dnode slot by an large dnode, in which case it returns * ENOENT. * * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just * return whether the hold would succeed or not. tag and dnp should set to * NULL in this case. * * errors: * EINVAL - Invalid object number or flags. * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) * EIO - I/O error when reading the meta dnode dbuf. * * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_children_t *dnc; dnode_phys_t *dn_block; dnode_handle_t *dnh; ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL)); /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool * which may require us to read from the root filesystem while * holding some (not all) of the locks as writer. */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT || object == DMU_PROJECTUSED_OBJECT) { if (object == DMU_USERUSED_OBJECT) dn = DMU_USERUSED_DNODE(os); else if (object == DMU_GROUPUSED_OBJECT) dn = DMU_GROUPUSED_DNODE(os); else dn = DMU_PROJECTUSED_DNODE(os); if (dn == NULL) return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); /* Don't actually hold if dry run, just return 0 */ if (!(flag & DNODE_DRY_RUN)) { (void) zfs_refcount_add(&dn->dn_holds, tag); *dnp = dn; } return (0); } if (object == 0 || object >= DN_MAX_OBJECT) return (SET_ERROR(EINVAL)); mdn = DMU_META_DNODE(os); ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { rw_enter(&mdn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); if (db == NULL) { DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); } /* * We do not need to decrypt to read the dnode so it doesn't matter * if we get the encrypted or decrypted version. */ err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); if (err) { DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); return (err); } ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; idx = object & (epb - 1); dn_block = (dnode_phys_t *)db->db.db_data; ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); dnc = dmu_buf_get_user(&db->db); dnh = NULL; if (dnc == NULL) { dnode_children_t *winner; int skip = 0; dnc = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); dnc->dnc_count = epb; dnh = &dnc->dnc_children[0]; /* Initialize dnode slot status from dnode_phys_t */ for (int i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); if (skip) { skip--; continue; } if (dn_block[i].dn_type != DMU_OT_NONE) { int interior = dn_block[i].dn_extra_slots; dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); dnode_set_slots(dnc, i + 1, interior, DN_SLOT_INTERIOR); skip = interior; } else { dnh[i].dnh_dnode = DN_SLOT_FREE; skip = 0; } } dmu_buf_init_user(&dnc->dnc_dbu, NULL, dnode_buf_evict_async, NULL); winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); if (winner != NULL) { for (int i = 0; i < epb; i++) zrl_destroy(&dnh[i].dnh_zrlock); kmem_free(dnc, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); dnc = winner; } } ASSERT(dnc->dnc_count == epb); if (flag & DNODE_MUST_BE_ALLOCATED) { slots = 1; dnode_slots_hold(dnc, idx, slots); dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { dn = dnh->dnh_dnode; } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { DNODE_STAT_BUMP(dnode_hold_alloc_interior); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(EEXIST)); } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { DNODE_STAT_BUMP(dnode_hold_alloc_misses); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOENT)); } else { dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); cond_resched(); } /* * Someone else won the race and called dnode_create() * after we checked DN_SLOT_IS_PTR() above but before * we acquired the lock. */ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); dn = dnh->dnh_dnode; } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); } } mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { DNODE_STAT_BUMP(dnode_hold_alloc_type_none); mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOENT)); } /* Don't actually hold if dry run, just return 0 */ if (flag & DNODE_DRY_RUN) { mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (0); } DNODE_STAT_BUMP(dnode_hold_alloc_hits); } else if (flag & DNODE_MUST_BE_FREE) { if (idx + slots - 1 >= DNODES_PER_BLOCK) { DNODE_STAT_BUMP(dnode_hold_free_overflow); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } dnode_slots_hold(dnc, idx, slots); if (!dnode_check_slots_free(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_misses); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); cond_resched(); } if (!dnode_check_slots_free(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_misses); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } /* * Allocated but otherwise free dnodes which would * be in the interior of a multi-slot dnodes need * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ if (slots > 1) dnode_reclaim_slots(dnc, idx + 1, slots - 1); dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { dn = dnh->dnh_dnode; } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); } mutex_enter(&dn->dn_mtx); if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { DNODE_STAT_BUMP(dnode_hold_free_refcount); mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(EEXIST)); } /* Don't actually hold if dry run, just return 0 */ if (flag & DNODE_DRY_RUN) { mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (0); } dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); DNODE_STAT_BUMP(dnode_hold_free_hits); } else { dbuf_rele(db, FTAG); return (SET_ERROR(EINVAL)); } ASSERT0(dn->dn_free_txg); if (zfs_refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ dnode_slots_rele(dnc, idx, slots); DNODE_VERIFY(dn); ASSERT3P(dnp, !=, NULL); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); *dnp = dn; return (0); } /* * Return held dnode if the object is allocated, NULL if not. */ int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, dnp)); } /* * Can only add a reference if there is already at least one * reference on the dnode. Returns FALSE if unable to add a * new reference. */ boolean_t dnode_add_ref(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); if (zfs_refcount_is_zero(&dn->dn_holds)) { mutex_exit(&dn->dn_mtx); return (FALSE); } VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); mutex_exit(&dn->dn_mtx); return (TRUE); } void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag, B_FALSE); } void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ dmu_buf_impl_t *db = dn->dn_dbuf; dnode_handle_t *dnh = dn->dn_handle; refs = zfs_refcount_remove(&dn->dn_holds, tag); if (refs == 0) cv_broadcast(&dn->dn_nodnholds); mutex_exit(&dn->dn_mtx); /* dnode could get destroyed at this point, so don't use it anymore */ /* * It's unsafe to release the last hold on a dnode by dnode_rele() or * indirectly by dbuf_rele() while relying on the dnode handle to * prevent the dnode from moving, since releasing the last hold could * result in the dnode's parent dbuf evicting its dnode handles. For * that reason anyone calling dnode_rele() or dbuf_rele() without some * other direct or indirect hold on the dnode must first drop the dnode * handle. */ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { /* * Another thread could add a hold to the dnode handle in * dnode_hold_impl() while holding the parent dbuf. Since the * hold on the parent dbuf prevents the handle from being * destroyed, the hold on the handle is OK. We can't yet assert * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, dnh, evicting); } } /* * Test whether we can create a dnode at the specified location. */ int dnode_try_claim(objset_t *os, uint64_t object, int slots) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN, slots, NULL, NULL)); } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx); return; } DNODE_VERIFY(dn); #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif /* * Determine old uid/gid when necessary */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK]; multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); /* * If we are already marked dirty, we're done. */ if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { multilist_sublist_unlock(mls); return; } ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) || !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]); ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", (u_longlong_t)dn->dn_object, (u_longlong_t)txg); multilist_sublist_insert_head(mls, dn); multilist_sublist_unlock(mls); /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) { mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { mutex_exit(&dn->dn_mtx); return; } dn->dn_free_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); } /* * Try to change the block size for the indicated dnode. This can only * succeed if there are no blocks allocated or dirty beyond first block */ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db; int err; ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); if (ibs == dn->dn_indblkshift) ibs = 0; if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } } mutex_exit(&dn->dn_dbufs_mtx); if (ibs && dn->dn_nlevels != 1) goto fail; /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) { dbuf_new_size(db, size, tx); } else if (err != ENOENT) { goto fail; } dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; if (ibs) { dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } /* release after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); fail: rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(ENOTSUP)); } static void dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; list_t *list; dbuf_dirty_record_t *new, *dr, *dr_next; ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT3U(new_nlevels, >, dn->dn_nlevels); dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); if (dr->dr_dbuf == NULL || (dr->dr_dbuf->db_level == old_nlevels - 1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; } } mutex_exit(&new->dt.di.dr_mtx); mutex_exit(&dn->dn_mtx); } int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx) { int ret = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_nlevels == nlevels) { ret = 0; goto out; } else if (nlevels < dn->dn_nlevels) { ret = SET_ERROR(EINVAL); goto out; } dnode_set_nlevels_impl(dn, nlevels, tx); out: rw_exit(&dn->dn_struct_rwlock); return (ret); } /* read-holding callers must not rely on the lock being continuously held */ void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, boolean_t force) { int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * if we have a read-lock, check to see if we need to do any work * before upgrading to a write-lock. */ if (have_read) { if (blkid <= dn->dn_maxblkid) return; if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } } /* * Raw sends (indicated by the force flag) require that we take the * given blkid even if the value is lower than the current value. */ if (!force && blkid <= dn->dn_maxblkid) goto out; /* * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] * to indicate that this field is set. This allows us to set the * maxblkid to 0 on an existing object in dnode_sync(). */ dn->dn_maxblkid = blkid; dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid | DMU_NEXT_MAXBLKID_SET; /* * Compute the number of levels necessary to support the new maxblkid. * Raw sends will ensure nlevels is set correctly for us. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS); if (!force) { if (new_nlevels > dn->dn_nlevels) dnode_set_nlevels_impl(dn, new_nlevels, tx); } else { ASSERT3U(dn->dn_nlevels, >=, new_nlevels); } out: if (have_read) rw_downgrade(&dn->dn_struct_rwlock); } static void dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); if (db != NULL) { dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } } /* * Dirty all the in-core level-1 dbufs in the range specified by start_blkid * and end_blkid. */ static void dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db; avl_index_t where; db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP); mutex_enter(&dn->dn_dbufs_mtx); db_search->db_level = 1; db_search->db_blkid = start_blkid + 1; db_search->db_state = DB_SEARCH; for (;;) { db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); if (db == NULL || db->db_level != 1 || db->db_blkid >= end_blkid) { break; } /* * Setup the next blkid we want to search for. */ db_search->db_blkid = db->db_blkid + 1; ASSERT3U(db->db_blkid, >=, start_blkid); /* * If the dbuf transitions to DB_EVICTING while we're trying * to dirty it, then we will be unable to discover it in * the dbuf hash table. This will result in a call to * dbuf_create() which needs to acquire the dn_dbufs_mtx * lock. To avoid a deadlock, we drop the lock before * dirtying the level-1 dbuf. */ mutex_exit(&dn->dn_dbufs_mtx); dnode_dirty_l1(dn, db->db_blkid, tx); mutex_enter(&dn->dn_dbufs_mtx); } #ifdef ZFS_DEBUG /* * Walk all the in-core level-1 dbufs and verify they have been dirtied. */ db_search->db_level = 1; db_search->db_blkid = start_blkid + 1; db_search->db_state = DB_SEARCH; db = avl_find(&dn->dn_dbufs, db_search, &where); if (db == NULL) db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_level != 1 || db->db_blkid >= end_blkid) break; if (db->db_state != DB_EVICTING) ASSERT(db->db_dirtycnt > 0); } #endif kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) { /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; if (ds != NULL) { rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); } if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { if (dmu_tx_is_syncing(tx)) dn->dn_dirtyctx = DN_DIRTY_SYNC; else dn->dn_dirtyctx = DN_DIRTY_OPEN; dn->dn_dirtyctx_firstset = tag; } if (ds != NULL) { rrw_exit(&ds->ds_bp_rwlock, tag); } } } +static void +dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int res; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, + FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { + db_lock_type_t dblt; + boolean_t dirty; + + dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + /* don't dirty if not on disk and not dirty */ + dirty = !list_is_empty(&db->db_dirty_records) || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, dblt, FTAG); + if (dirty) { + caddr_t data; + + dmu_buf_will_dirty(&db->db, tx); + data = db->db.db_data; + bzero(data + blkoff, len); + } + dbuf_rele(db, FTAG); + } +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { - dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } /* * First, block align the region to free: */ if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) return; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. */ blkid = 0; nblks = 1; if (dn->dn_nlevels > 1) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_dirty_l1(dn, 0, tx); rw_exit(&dn->dn_struct_rwlock); } goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ return; } else { /* Freeing part of the block. */ head = blksz - off; ASSERT3U(head, >, 0); } blkoff = off; } /* zero out any partial block data at the start of the range */ if (head) { - int res; ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), - TRUE, FALSE, FTAG, &db); - rw_exit(&dn->dn_struct_rwlock); - if (res == 0) { - caddr_t data; - boolean_t dirty; - - db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, - FTAG); - /* don't dirty if it isn't on disk and isn't dirty */ - dirty = !list_is_empty(&db->db_dirty_records) || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); - dmu_buf_unlock_parent(db, dblt, FTAG); - if (dirty) { - dmu_buf_will_dirty(&db->db, tx); - data = db->db.db_data; - bzero(data + blkoff, head); - } - dbuf_rele(db, FTAG); - } + dnode_partial_zero(dn, off, blkoff, head, tx); off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) return; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) return; ASSERT(ISP2(blksz)); if (trunc) tail = 0; else tail = P2PHASE(len, blksz); ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { - int res; if (len < tail) tail = len; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), - TRUE, FALSE, FTAG, &db); - rw_exit(&dn->dn_struct_rwlock); - if (res == 0) { - boolean_t dirty; - /* don't dirty if not on disk and not dirty */ - db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER, - FTAG); - dirty = !list_is_empty(&db->db_dirty_records) || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); - dmu_buf_unlock_parent(db, type, FTAG); - if (dirty) { - dmu_buf_will_dirty(&db->db, tx); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); - } + dnode_partial_zero(dn, off + len, 0, tail, tx); len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) return; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); blkid = off >> blkshift; nblks = len >> blkshift; if (trunc) nblks += 1; /* * Dirty all the indirect blocks in this range. Note that only * the first and last indirect blocks can actually be written * (if they were partially freed) -- they must be dirtied, even if * they do not exist on disk yet. The interior blocks will * be freed by free_children(), so they will not actually be written. * Even though these interior blocks will not be written, we * dirty them for two reasons: * * - It ensures that the indirect blocks remain in memory until * syncing context. (They have already been prefetched by * dmu_tx_hold_free(), so we don't have to worry about reading * them serially here.) * * - The dirty space accounting will put pressure on the txg sync * mechanism to begin syncing, and to delay transactions if there * is a large amount of freeing. Even though these indirect * blocks will not be written, we could need to write the same * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); uint64_t first, last; first = blkid >> epbs; dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; if (last != first) dnode_dirty_l1(dn, last, tx); dnode_dirty_l1range(dn, first, last, tx); int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { /* * Set i to the blockid of the next non-hole * level-1 indirect block at or after i. Note * that dnode_next_offset() operates in terms of * level-0-equivalent bytes. */ uint64_t ibyte = i << shift; int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); i = ibyte >> shift; if (i >= last) break; /* * Normally we should not see an error, either * from dnode_next_offset() or dbuf_hold_level() * (except for ESRCH from dnode_next_offset). * If there is an i/o error, then when we read * this block in syncing context, it will use * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according * to the "failmode" property. dnode_next_offset() * doesn't have a flag to indicate MUSTSUCCEED. */ if (err != 0) break; dnode_dirty_l1(dn, i, tx); } rw_exit(&dn->dn_struct_rwlock); } done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); } dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", (u_longlong_t)blkid, (u_longlong_t)nblks, (u_longlong_t)tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); } static boolean_t dnode_spill_freed(dnode_t *dn) { int i; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); /* * If we're in the process of opening the pool, dp will not be * set yet, but there shouldn't be anything dirty. */ if (dp == NULL) return (FALSE); if (dn->dn_free_txg) return (TRUE); if (blkid == DMU_SPILL_BLKID) return (dnode_spill_freed(dn)); mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_free_ranges[i] != NULL && range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* call from syncing context when we actually write/free space for this dnode */ void dnode_diduse_space(dnode_t *dn, int64_t delta) { uint64_t space; dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", dn, dn->dn_phys, (u_longlong_t)dn->dn_phys->dn_used, (longlong_t)delta); mutex_enter(&dn->dn_mtx); space = DN_USED_BYTES(dn->dn_phys); if (delta > 0) { ASSERT3U(space + delta, >=, space); /* no overflow */ } else { ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; } else { dn->dn_phys->dn_used = space; dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; } mutex_exit(&dn->dn_mtx); } /* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * * If level > 0, then we are scanning an indirect block looking at its * pointers. If level == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. * Otherwise, return with *offset pointing to the beginning (if searching * forwards) or end (if searching backwards) of the range covered by the * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until * we find something (i.e., we don't return ESRCH) and then search back * down the tree (narrow the search) until we reach our original search * level. */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; boolean_t hole; int i, inc, error, span; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); /* * This can only happen when we are searching up * the block tree for data. We don't really need to * adjust the offset, as we will just end up looking * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; rw_enter(&db->db_rwlock, RW_READER); } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; ASSERT(dn->dn_type == DMU_OT_DNODE); ASSERT(!(flags & DNODE_FIND_BACKWARDS)); for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; } if (i == blkfill) error = SET_ERROR(ESRCH); *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); if (hole) maxfill--; else minfill++; if (span >= 8 * sizeof (*offset)) { /* This only happens on the highest indirection level */ ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); *offset = 0; } else { *offset = *offset >> span; } for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; } if (span >= 8 * sizeof (*offset)) { *offset = start; } else { *offset = *offset << span; } if (inc < 0) { /* traversing backwards; position offset at the end */ ASSERT3U(*offset, <=, start); *offset = MIN(*offset + (1ULL << span) - 1, start); } else if (*offset < start) { *offset = start; } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); } if (db != NULL) { rw_exit(&db->db_rwlock); dbuf_rele(db, FTAG); } return (error); } /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; if (!(flags & DNODE_FIND_HAVELOCK)) rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { error = SET_ERROR(ESRCH); goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = SET_ERROR(ESRCH); } goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); } /* * There's always a "virtual hole" at the end of the object, even * if all BP's which physically exist are non-holes. */ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { error = 0; } if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? initial_offset < *offset : initial_offset > *offset)) error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); } #if defined(_KERNEL) EXPORT_SYMBOL(dnode_hold); EXPORT_SYMBOL(dnode_rele); EXPORT_SYMBOL(dnode_set_nlevels); EXPORT_SYMBOL(dnode_set_blksz); EXPORT_SYMBOL(dnode_free_range); EXPORT_SYMBOL(dnode_evict_dbufs); EXPORT_SYMBOL(dnode_evict_bonus); #endif diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c index 354e021d9d26..35a379dded69 100644 --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -1,328 +1,336 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2021 by Delphix. All rights reserved. */ #include #include /* * Reference count tracking is disabled by default. It's memory requirements * are reasonable, however as implemented it consumes a significant amount of * cpu time. Until its performance is improved it should be manually enabled. */ int reference_tracking_enable = FALSE; int reference_history = 3; /* tunable */ #ifdef ZFS_DEBUG static kmem_cache_t *reference_cache; static kmem_cache_t *reference_history_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); reference_history_cache = kmem_cache_create("reference_history_cache", sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); kmem_cache_destroy(reference_history_cache); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&rc->rc_list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; } void zfs_refcount_create_tracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_TRUE; } void zfs_refcount_create_untracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_FALSE; } void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; ASSERT3U(rc->rc_count, ==, number); while ((ref = list_head(&rc->rc_list))) { list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); } list_destroy(&rc->rc_list); while ((ref = list_head(&rc->rc_removed))) { list_remove(&rc->rc_removed, ref); kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } void zfs_refcount_destroy(zfs_refcount_t *rc) { zfs_refcount_destroy_many(rc, 0); } int zfs_refcount_is_zero(zfs_refcount_t *rc) { return (zfs_refcount_count(rc) == 0); } int64_t zfs_refcount_count(zfs_refcount_t *rc) { return (atomic_load_64(&rc->rc_count)); } int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref = NULL; int64_t count; if (!rc->rc_tracked) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); } ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, 0); list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t zfs_refcount_add(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_add_many(rc, 1, holder)); } int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref; int64_t count; if (!rc->rc_tracked) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder && ref->ref_number == number) { list_remove(&rc->rc_list, ref); if (reference_history > 0) { ref->ref_removed = kmem_cache_alloc(reference_history_cache, KM_SLEEP); list_insert_head(&rc->rc_removed, ref); rc->rc_removed_count++; if (rc->rc_removed_count > reference_history) { ref = list_tail(&rc->rc_removed); list_remove(&rc->rc_removed, ref); kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); rc->rc_removed_count--; } } else { kmem_cache_free(reference_cache, ref); } rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } } panic("No such hold %p on refcount %llx", holder, (u_longlong_t)(uintptr_t)rc); return (-1); } int64_t zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_remove_many(rc, 1, holder)); } void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { int64_t count, removed_count; list_t list, removed; list_create(&list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&removed, sizeof (reference_t), offsetof(reference_t, ref_link)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; list_move_tail(&list, &src->rc_list); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; list_move_tail(&dst->rc_list, &list); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); list_destroy(&list); list_destroy(&removed); } void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { reference_t *ref; boolean_t found = B_FALSE; if (!rc->rc_tracked) return; mutex_enter(&rc->rc_mtx); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == current_holder && ref->ref_number == number) { ref->ref_holder = new_holder; found = B_TRUE; break; } } ASSERT(found); mutex_exit(&rc->rc_mtx); } void zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, const void *new_holder) { return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder, new_holder)); } /* * If tracking is enabled, return true if a reference exists that matches * the "holder" tag. If tracking is disabled, then return true if a reference * might be held. */ boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref; if (!rc->rc_tracked) return (zfs_refcount_count(rc) > 0); mutex_enter(&rc->rc_mtx); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder) { mutex_exit(&rc->rc_mtx); return (B_TRUE); } } mutex_exit(&rc->rc_mtx); return (B_FALSE); } /* * If tracking is enabled, return true if a reference does not exist that * matches the "holder" tag. If tracking is disabled, always return true * since the reference might not be held. */ boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref; if (!rc->rc_tracked) return (B_TRUE); mutex_enter(&rc->rc_mtx); for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder) { mutex_exit(&rc->rc_mtx); return (B_FALSE); } } mutex_exit(&rc->rc_mtx); return (B_TRUE); } +EXPORT_SYMBOL(zfs_refcount_create); +EXPORT_SYMBOL(zfs_refcount_destroy); +EXPORT_SYMBOL(zfs_refcount_is_zero); +EXPORT_SYMBOL(zfs_refcount_count); +EXPORT_SYMBOL(zfs_refcount_add); +EXPORT_SYMBOL(zfs_refcount_remove); +EXPORT_SYMBOL(zfs_refcount_held); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW, "Maximum reference holders being tracked"); /* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 2763bd8de1c4..0ba76f6b88d9 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -1,5423 +1,5447 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vd->vdev_enc_sysfs_path) == 0) vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); /* * Regardless whether this vdev was just added or it is being * expanded, the metaslab count has changed. Recalculate the * block limit. */ spa_log_sm_set_blocklimit(spa); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift == 0) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd) { vdev_ashift_optimize(vd); } } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { + char *old, *new; if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } + + /* + * Our enclosure sysfs path may have changed between imports + */ + old = dvd->vdev_enc_sysfs_path; + new = svd->vdev_enc_sysfs_path; + if ((old != NULL && new == NULL) || + (old == NULL && new != NULL) || + ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " + "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, + old, new); + + if (dvd->vdev_enc_sysfs_path) + spa_strfree(dvd->vdev_enc_sysfs_path); + + if (svd->vdev_enc_sysfs_path) { + dvd->vdev_enc_sysfs_path = spa_strdup( + svd->vdev_enc_sysfs_path); + } else { + dvd->vdev_enc_sysfs_path = NULL; + } + } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vdev_get_nparity(vd) != 0) minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_class[t].vqc_active; vsx->vsx_pend_queue[t] = avl_numnodes( &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_IOCTL. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %lu active IOs", vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, "Default limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below zed threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 8229bc9a93e5..9bd75c011ef9 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -1,911 +1,919 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); return (0); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* file was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_holey_common(zp, cmd, off); ZFS_EXIT(zfsvfs); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ /*ARGSUSED*/ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * inode - atime updated if byte count > 0 */ /* ARGSUSED */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } /* * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { ZFS_EXIT(zfsvfs); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to O_SYNC, which results in unnecessary calls to zil_commit. We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } ASSERT(zfs_uio_offset(uio) < zp->z_size); + ssize_t start_offset = zfs_uio_offset(uio); ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); + /* + * if we actually read some bytes, bubbling EFAULT + * up to become EAGAIN isn't what we want here. + */ + if (error == EFAULT && + (zfs_uio_offset(uio) - start_offset) != 0) + error = 0; break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; ssize_t start_resid = zfs_uio_resid(uio); /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } const uint64_t max_blksz = zfsvfs->z_max_blksz; /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ zfs_locked_range_t *lr; if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (zn_rlimit_fsize(zp, uio)) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } const rlim64_t limit = MAXOFFSET_T; if (woff >= limit) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if (n > limit - woff) n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } arc_buf_t *abuf = NULL; if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if ((error = zfs_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT3S(cbytes, ==, max_blksz); } /* * Start a transaction. */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, MIN(n, max_blksz)); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ const ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { dmu_tx_commit(tx); /* * Account for partial writes before * continuing the loop. * Update needs to occur before the next * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ if (tx_bytes != zfs_uio_resid(uio)) n -= tx_bytes - zfs_uio_resid(uio); if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { break; } continue; } #endif if (error != 0) { dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { /* Implied by abuf != NULL: */ ASSERT3S(n, >=, max_blksz); ASSERT0(P2PHASE(woff, max_blksz)); /* * We can simplify nbytes to MIN(n, max_blksz) since * P2PHASE(woff, max_blksz) is 0, and knowing * n >= max_blksz lets us simplify further: */ ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. * * dmu_assign_arcbuf_by_dbuf() will directly assign the * arc buffer to a dbuf. */ error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; if (n > 0) { if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { error = SET_ERROR(EFAULT); break; } } } zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); ZFS_EXIT(zfsvfs); return (0); } /*ARGSUSED*/ int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif static void zfs_get_done(zgd_t *zgd, int error); /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } /* check if generation number matches */ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (zp_gen)) != 0) { zfs_zrele_async(zp); return (SET_ERROR(EIO)); } if (zp_gen != gen) { zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /* ARGSUSED */ static void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index b7831c3acfd1..01e1f79e5852 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -1,174 +1,174 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/acl/posix:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix'] [tests/functional/acl/posix-sa:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix-sa'] [tests/functional/atime:Linux] tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] [tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] [tests/functional/checksum:Linux] tests = ['run_edonr_test'] tags = ['functional', 'checksum'] [tests/functional/cli_root/zfs:Linux] tests = ['zfs_003_neg'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_mount:Linux] tests = ['zfs_mount_006_pos', 'zfs_mount_008_pos', 'zfs_mount_013_pos', 'zfs_mount_014_neg', 'zfs_multi_mount'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_share:Linux] tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', - 'zfs_share_012_pos'] + 'zfs_share_012_pos', 'zfs_share_013_pos'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_sysfs:Linux] tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', 'zpool_set_unsupported'] tags = ['functional', 'cli_root', 'zfs_sysfs'] [tests/functional/cli_root/zpool_add:Linux] tests = ['add_nested_replacing_spare'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_expand:Linux] tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] [tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = ['functional', 'cli_root', 'zpool_reopen'] [tests/functional/cli_root/zpool_split:Linux] tests = ['zpool_split_wholedisk'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/compression:Linux] tests = ['compress_004_pos'] tags = ['functional', 'compression'] [tests/functional/devices:Linux] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill'] tags = ['functional', 'events'] [tests/functional/fallocate:Linux] tests = ['fallocate_prealloc'] tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/io:Linux] tests = ['libaio', 'io_uring'] tags = ['functional', 'io'] [tests/functional/mmap:Linux] tests = ['mmap_libaio_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mmp:Linux] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] tests = ['pam_basic', 'pam_nounmount'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', 'procfs_list_stale_read', 'pool_state'] tags = ['functional', 'procfs'] [tests/functional/projectquota:Linux] tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', 'projectquota_004_neg', 'projectquota_005_pos', 'projectquota_006_pos', 'projectquota_007_pos', 'projectquota_008_pos', 'projectquota_009_pos', 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', 'projectspace_004_pos', 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg'] tags = ['functional', 'projectquota'] [tests/functional/rsend:Linux] tests = ['send_realloc_dnode_size', 'send_encrypted_files'] tags = ['functional', 'rsend'] [tests/functional/snapshot:Linux] tests = ['snapshot_015_pos', 'snapshot_016_pos'] tags = ['functional', 'snapshot'] [tests/functional/tmpfile:Linux] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', 'tmpfile_stat_mode'] tags = ['functional', 'tmpfile'] [tests/functional/upgrade:Linux] tests = ['upgrade_projectquota_001_pos'] tags = ['functional', 'upgrade'] [tests/functional/user_namespace:Linux] tests = ['user_namespace_001'] tags = ['functional', 'user_namespace'] [tests/functional/userquota:Linux] tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', 'userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] diff --git a/sys/contrib/openzfs/tests/runfiles/sanity.run b/sys/contrib/openzfs/tests/runfiles/sanity.run index ad4495144cda..2a9b4adf5fb9 100644 --- a/sys/contrib/openzfs/tests/runfiles/sanity.run +++ b/sys/contrib/openzfs/tests/runfiles/sanity.run @@ -1,623 +1,623 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # This run file contains a subset of functional tests which exercise # as much functionality as possible while still executing relatively # quickly. The included tests should take no more than a few seconds # each to run at most. This provides a convenient way to sanity test a # change before committing to a full test run which takes several hours. # # Approximate run time: 15 minutes # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 180 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/acl/off] tests = ['posixmode'] tags = ['functional', 'acl'] [tests/functional/alloc_class] tests = ['alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_008_pos', 'alloc_class_010_pos', 'alloc_class_011_neg'] tags = ['functional', 'alloc_class'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] [tests/functional/bootfs] tests = ['bootfs_004_neg', 'bootfs_007_pos'] tags = ['functional', 'bootfs'] [tests/functional/cache] tests = ['cache_004_neg', 'cache_005_neg', 'cache_007_neg', 'cache_010_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'sensitive_none_lookup', 'sensitive_none_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'mixed_none_lookup', 'mixed_none_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy'] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/cli_root/zdb] tests = ['zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_encrypted'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_dryrun', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_cliargs', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_003_pos', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_encrypted', 'zfs_send_raw'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_002_neg'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_encrypted', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_follow', 'zpool_events_poolname'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_003_pos', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_014_pos', 'zpool_import_features_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_online_offline'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_003_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_009_neg'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_no_activity', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/compression] -tests = ['compress_003_pos'] +tests = ['compress_003_pos','compress_zstd_bswap'] tags = ['functional', 'compression'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/features/large_dnode] tests = ['large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] tests = ['run_hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/limits] tests = ['filesystem_count', 'snapshot_count'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_read_001_pos'] tags = ['functional', 'mmap'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/nopwrite] tests = ['nopwrite_sync', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/pool_checkpoint] tests = ['checkpoint_conf_change', 'checkpoint_discard_many', 'checkpoint_removal', 'checkpoint_sm_scale', 'checkpoint_twice'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_mixed_recsize', 'redacted_negative', 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg'] tags = ['functional', 'raidz'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_005_pos', 'refreserv_multi_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_sanity', 'removal_with_dedup', 'removal_with_ganging', 'removal_with_faulted'] tags = ['functional', 'removal'] [tests/functional/replacement] tests = ['rebuild_raidz'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_014_pos', 'rsend_016_neg', 'send-c_verify_contents', 'send-c_volume', 'send-c_zstreamdump', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_holds', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_008_neg', 'slog_009_neg', 'slog_010_neg'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_002_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none'] tags = ['functional', 'suid'] [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_013_pos'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index 1acb8b9c1be2..e9f3ab1728d6 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -1,455 +1,457 @@ #!/usr/bin/env @PYTHON_SHEBANG@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2017 by Delphix. All rights reserved. # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # # This script must remain compatible with Python 2.6+ and Python 3.4+. # import os import re import sys # # This script parses the stdout of zfstest, which has this format: # # Test: /path/to/testa (run as root) [00:00] [PASS] # Test: /path/to/testb (run as jkennedy) [00:00] [PASS] # Test: /path/to/testc (run as root) [00:00] [FAIL] # [...many more results...] # # Results Summary # FAIL 22 # SKIP 32 # PASS 1156 # # Running Time: 02:50:31 # Percent passed: 95.5% # Log directory: /var/tmp/test_results/20180615T205926 # # # Common generic reasons for a test or test group to be skipped. # # Some test cases are known to fail in ways which are not harmful or dangerous. # In these cases simply mark the test as a known failure until it can be # updated and the issue resolved. Note that it's preferable to open a unique # issue on the GitHub issue tracker for each test case failure. # known_reason = 'Known issue' # # Some tests require that a test user be able to execute the zfs utilities. # This may not be possible when testing in-tree due to the default permissions # on the user's home directory. When testing this can be resolved by granting # group read access. # # chmod 0750 $HOME # exec_reason = 'Test user execute permissions required for utilities' # # Some tests require a minimum python version of 3.5 and will be skipped when # the default system version is too old. There may also be tests which require # additional python modules be installed, for example python-cffi is required # by the pyzfs tests. # python_reason = 'Python v3.5 or newer required' python_deps_reason = 'Python modules missing: python-cffi' # # Some tests require the O_TMPFILE flag which was first introduced in the # 3.11 kernel. # tmpfile_reason = 'Kernel O_TMPFILE support required' # # Some tests require the statx(2) system call on Linux which was first # introduced in the 4.11 kernel. # statx_reason = 'Kernel statx(2) system call required on Linux' # # Some tests require that the NFS client and server utilities be installed. # share_reason = 'NFS client and server utilities required' # # Some tests require that the lsattr utility support the project id feature. # project_id_reason = 'lsattr with set/show project ID required' # # Some tests require that the kernel support user namespaces. # user_ns_reason = 'Kernel user namespace support required' # # Some rewind tests can fail since nothing guarantees that old MOS blocks # are not overwritten. Snapshots protect datasets and data files but not # the MOS. Reasonable efforts are made in the test case to increase the # odds that some txgs will have their MOS data left untouched, but it is # never a sure thing. # rewind_reason = 'Arbitrary pool rewind is not guaranteed' # # Some tests may by structured in a way that relies on exact knowledge # of how much free space in available in a pool. These tests cannot be # made completely reliable because the internal details of how free space # is managed are not exposed to user space. # enospc_reason = 'Exact free space reporting is not guaranteed' # # Some tests require a minimum version of the fio benchmark utility. # Older distributions such as CentOS 6.x only provide fio-2.0.13. # fio_reason = 'Fio v2.3 or newer required' # # Some tests require that the DISKS provided support the discard operation. # Normally this is not an issue because loop back devices are used for DISKS # and they support discard (TRIM/UNMAP). # trim_reason = 'DISKS must support discard (TRIM/UNMAP)' # # Some tests on FreeBSD require the fspacectl(2) system call and the # truncate(1) utility supporting the -d option. The system call was first # introduced in FreeBSD version 1400032. # fspacectl_reason = 'fspacectl(2) and truncate -d support required' # # Some tests are not applicable to a platform or need to be updated to operate # in the manor required by the platform. Any tests which are skipped for this # reason will be suppressed in the final analysis output. # na_reason = "Not applicable" # # Some test cases doesn't have all requirements to run on Github actions CI. # ci_reason = 'CI runner doesn\'t have all requirements' summary = { 'total': float(0), 'passed': float(0), 'logfile': "Could not determine logfile location." } # # These tests are known to fail, thus we use this list to prevent these # failures from failing the job as a whole; only unexpected failures # bubble up to cause this script to exit with a non-zero exit status. # # Format: { 'test-name': ['expected result', 'issue-number | reason'] } # # For each known failure it is recommended to link to a GitHub issue by # setting the reason to the issue number. Alternately, one of the generic # reasons listed above can be used. # known = { 'casenorm/mixed_none_lookup_ci': ['FAIL', '7633'], 'casenorm/mixed_formd_lookup_ci': ['FAIL', '7633'], 'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason], 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], 'rsend/rsend_008_pos': ['SKIP', '6066'], 'vdev_zaps/vdev_zaps_007_pos': ['FAIL', known_reason], } if sys.platform.startswith('freebsd'): known.update({ 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'link_count/link_count_001': ['SKIP', na_reason], }) elif sys.platform.startswith('linux'): known.update({ 'casenorm/mixed_formd_lookup': ['FAIL', '7633'], 'casenorm/mixed_formd_delete': ['FAIL', '7633'], 'casenorm/sensitive_formd_lookup': ['FAIL', '7633'], 'casenorm/sensitive_formd_delete': ['FAIL', '7633'], 'removal/removal_with_zdb': ['SKIP', known_reason], }) # # These tests may occasionally fail or be skipped. We want there failures # to be reported but only unexpected failures should bubble up to cause # this script to exit with a non-zero exit status. # # Format: { 'test-name': ['expected result', 'issue-number | reason'] } # # For each known failure it is recommended to link to a GitHub issue by # setting the reason to the issue number. Alternately, one of the generic # reasons listed above can be used. # maybe = { 'chattr/setup': ['SKIP', exec_reason], 'crtime/crtime_001_pos': ['SKIP', statx_reason], 'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason], 'cli_root/zfs_destroy/zfs_destroy_dev_removal_condense': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_004_pos': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', '5479'], 'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', known_reason], 'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', known_reason], 'cli_root/zfs_share/setup': ['SKIP', share_reason], 'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason], 'cli_root/zfs_unshare/setup': ['SKIP', share_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'], 'cli_root/zpool_import/import_rewind_device_replaced': ['FAIL', rewind_reason], 'cli_root/zpool_import/import_rewind_config_changed': ['FAIL', rewind_reason], 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', '6839'], 'cli_root/zpool_initialize/zpool_initialize_import_export': ['FAIL', '11948'], 'cli_root/zpool_labelclear/zpool_labelclear_removed': ['FAIL', known_reason], 'cli_root/zpool_trim/setup': ['SKIP', trim_reason], 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', '6141'], 'delegate/setup': ['SKIP', exec_reason], 'fallocate/fallocate_punch-hole': ['SKIP', fspacectl_reason], 'history/history_004_pos': ['FAIL', '7026'], 'history/history_005_neg': ['FAIL', '6680'], 'history/history_006_neg': ['FAIL', '5657'], 'history/history_008_pos': ['FAIL', known_reason], 'history/history_010_pos': ['SKIP', exec_reason], 'io/mmap': ['SKIP', fio_reason], 'largest_pool/largest_pool_001_pos': ['FAIL', known_reason], 'mmp/mmp_on_uberblocks': ['FAIL', known_reason], 'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason], 'no_space/enospc_002_pos': ['FAIL', enospc_reason], 'pool_checkpoint/checkpoint_discard_busy': ['FAIL', '11946'], 'projectquota/setup': ['SKIP', exec_reason], 'redundancy/redundancy_004_neg': ['FAIL', '7290'], 'redundancy/redundancy_draid_spare3': ['SKIP', known_reason], 'removal/removal_condense_export': ['FAIL', known_reason], 'reservation/reservation_008_pos': ['FAIL', '7741'], 'reservation/reservation_018_pos': ['FAIL', '5642'], 'rsend/rsend_019_pos': ['FAIL', '6086'], 'rsend/rsend_020_pos': ['FAIL', '6446'], 'rsend/rsend_021_pos': ['FAIL', '6446'], 'rsend/rsend_024_pos': ['FAIL', '5665'], 'rsend/send-c_volume': ['FAIL', '6087'], 'rsend/send_partial_dataset': ['FAIL', known_reason], 'snapshot/clone_001_pos': ['FAIL', known_reason], 'snapshot/snapshot_009_pos': ['FAIL', '7961'], 'snapshot/snapshot_010_pos': ['FAIL', '7961'], 'snapused/snapused_004_pos': ['FAIL', '5513'], 'tmpfile/setup': ['SKIP', tmpfile_reason], 'threadsappend/threadsappend_001_pos': ['FAIL', '6136'], 'trim/setup': ['SKIP', trim_reason], 'upgrade/upgrade_projectquota_001_pos': ['SKIP', project_id_reason], 'user_namespace/setup': ['SKIP', user_ns_reason], 'userquota/setup': ['SKIP', exec_reason], 'vdev_zaps/vdev_zaps_004_pos': ['FAIL', '6935'], 'zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos': ['FAIL', '5848'], 'pam/setup': ['SKIP', "pamtester might be not available"], } if sys.platform.startswith('freebsd'): maybe.update({ 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], 'cli_root/zfs_receive/receive-o-x_props_override': ['FAIL', known_reason], 'cli_root/zfs_share/zfs_share_011_pos': ['FAIL', known_reason], 'cli_root/zfs_share/zfs_share_concurrent_shares': ['FAIL', known_reason], 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', '11829'], 'resilver/resilver_restart_001': ['FAIL', known_reason], - 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], + 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', '12622'], + 'pool_checkpoint/checkpoint_indirect': ['FAIL', '12623'], }) elif sys.platform.startswith('linux'): maybe.update({ 'alloc_class/alloc_class_009_pos': ['FAIL', known_reason], 'alloc_class/alloc_class_010_pos': ['FAIL', known_reason], 'alloc_class/alloc_class_011_neg': ['FAIL', known_reason], 'alloc_class/alloc_class_012_pos': ['FAIL', known_reason], 'alloc_class/alloc_class_013_pos': ['FAIL', '11888'], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason], 'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'fault/auto_spare_shared': ['FAIL', '11889'], 'io/io_uring': ['SKIP', 'io_uring support required'], 'limits/filesystem_limit': ['SKIP', known_reason], 'limits/snapshot_limit': ['SKIP', known_reason], 'mmp/mmp_active_import': ['FAIL', known_reason], 'mmp/mmp_exported_import': ['FAIL', known_reason], 'mmp/mmp_inactive_import': ['FAIL', known_reason], 'refreserv/refreserv_raidz': ['FAIL', known_reason], 'rsend/rsend_007_pos': ['FAIL', known_reason], 'rsend/rsend_010_pos': ['FAIL', known_reason], 'rsend/rsend_011_pos': ['FAIL', known_reason], 'snapshot/rollback_003_pos': ['FAIL', known_reason], + 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', '12621'], }) # Not all Github actions runners have scsi_debug module, so we may skip # some tests which use it. if os.environ.get('CI') == 'true': known.update({ 'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', ci_reason], 'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', ci_reason], 'cli_root/zpool_expand/zpool_expand_005_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/setup': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_001_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_002_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_004_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_005_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_006_neg': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_007_pos': ['SKIP', ci_reason], 'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason], 'fault/auto_offline_001_pos': ['SKIP', ci_reason], 'fault/auto_online_001_pos': ['SKIP', ci_reason], 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) maybe.update({ 'events/events_002_pos': ['FAIL', '11546'], }) def usage(s): print(s) sys.exit(1) def process_results(pathname): try: f = open(pathname) except IOError as e: print('Error opening file: %s' % e) sys.exit(1) prefix = '/zfs-tests/tests/functional/' pattern = \ r'^Test(?:\s+\(\S+\))?:' + \ r'\s*\S*%s(\S+)\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' \ % prefix pattern_log = r'^\s*Log directory:\s*(\S*)' d = {} for line in f.readlines(): m = re.match(pattern, line) if m and len(m.groups()) == 4: summary['total'] += 1 if m.group(4) == "PASS": summary['passed'] += 1 d[m.group(1)] = m.group(4) continue m = re.match(pattern_log, line) if m: summary['logfile'] = m.group(1) return d if __name__ == "__main__": if len(sys.argv) != 2: usage('usage: %s ' % sys.argv[0]) results = process_results(sys.argv[1]) if summary['total'] == 0: print("\n\nNo test results were found.") print("Log directory: %s" % summary['logfile']) sys.exit(0) expected = [] unexpected = [] for test in list(results.keys()): if results[test] == "PASS": continue setup = test.replace(os.path.basename(test), "setup") if results[test] == "SKIP" and test != setup: if setup in known and known[setup][0] == "SKIP": continue if setup in maybe and maybe[setup][0] == "SKIP": continue if ((test not in known or results[test] not in known[test][0]) and (test not in maybe or results[test] not in maybe[test][0])): unexpected.append(test) else: expected.append(test) print("\nTests with results other than PASS that are expected:") for test in sorted(expected): issue_url = 'https://github.com/openzfs/zfs/issues/' # Include the reason why the result is expected, given the following: # 1. Suppress test results which set the "Not applicable" reason. # 2. Numerical reasons are assumed to be GitHub issue numbers. # 3. When an entire test group is skipped only report the setup reason. if test in known: if known[test][1] == na_reason: continue elif known[test][1].isdigit(): expect = issue_url + known[test][1] else: expect = known[test][1] elif test in maybe: if maybe[test][1].isdigit(): expect = issue_url + maybe[test][1] else: expect = maybe[test][1] elif setup in known and known[setup][0] == "SKIP" and setup != test: continue elif setup in maybe and maybe[setup][0] == "SKIP" and setup != test: continue else: expect = "UNKNOWN REASON" print(" %s %s (%s)" % (results[test], test, expect)) print("\nTests with result of PASS that are unexpected:") for test in sorted(known.keys()): # We probably should not be silently ignoring the case # where "test" is not in "results". if test not in results or results[test] != "PASS": continue print(" %s %s (expected %s)" % (results[test], test, known[test][0])) print("\nTests with results other than PASS that are unexpected:") for test in sorted(unexpected): expect = "PASS" if test not in known else known[test][0] print(" %s %s (expected %s)" % (results[test], test, expect)) if len(unexpected) == 0: sys.exit(0) else: sys.exit(1) diff --git a/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib b/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib index 5a7e76c0ddbf..c9c01ab752ea 100644 --- a/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib +++ b/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib @@ -1,530 +1,580 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # Copyright (c) 2012, 2020 by Delphix. All rights reserved. # . ${STF_TOOLS}/include/stf.shlib # Output an assertion # # $@ - assertion text function log_assert { _printline ASSERTION: "$@" } # Output a comment # # $@ - comment text function log_note { _printline NOTE: "$@" } # Execute and print command with status where success equals non-zero result # # $@ - command to execute # # return 0 if command fails, otherwise return 1 function log_neg { log_neg_expect "" "$@" return $? } # Execute a positive test and exit $STF_FAIL is test fails # # $@ - command to execute function log_must { log_pos "$@" (( $? != 0 )) && log_fail } +# Execute a positive test (expecting no stderr) and exit $STF_FAIL +# if test fails +# $@ - command to execute + +function log_must_nostderr +{ + log_pos_nostderr "$@" + (( $? != 0 )) && log_fail +} + # Execute a positive test but retry the command on failure if the output # matches an expected pattern. Otherwise behave like log_must and exit # $STF_FAIL is test fails. # # $1 - retry keyword # $2 - retry attempts # $3-$@ - command to execute # function log_must_retry { typeset out="" typeset logfile="/tmp/log.$$" typeset status=1 typeset expect=$1 typeset retry=$2 typeset delay=1 shift 2 while [[ -e $logfile ]]; do logfile="$logfile.$$" done while (( $retry > 0 )); do "$@" 2>$logfile status=$? out="cat $logfile" if (( $status == 0 )); then $out | egrep -i "internal error|assertion failed" \ > /dev/null 2>&1 # internal error or assertion failed if [[ $? -eq 0 ]]; then print -u2 $($out) _printerror "$@" "internal error or" \ " assertion failure exited $status" status=1 else [[ -n $LOGAPI_DEBUG ]] && cat $logfile _printsuccess "$@" fi break else $out | grep -i "$expect" > /dev/null 2>&1 if (( $? == 0 )); then print -u2 $($out) _printerror "$@" "Retry in $delay seconds" sleep $delay (( retry=retry - 1 )) (( delay=delay * 2 )) else break; fi fi done if (( $status != 0 )) ; then print -u2 $($out) _printerror "$@" "exited $status" fi _recursive_output $logfile "false" return $status } # Execute a positive test and exit $STF_FAIL is test fails after being # retried up to 5 times when the command returns the keyword "busy". # # $@ - command to execute function log_must_busy { log_must_retry "busy" 5 "$@" (( $? != 0 )) && log_fail } # Execute a negative test and exit $STF_FAIL if test passes # # $@ - command to execute function log_mustnot { log_neg "$@" (( $? != 0 )) && log_fail } # Execute a negative test with keyword expected, and exit # $STF_FAIL if test passes # # $1 - keyword expected # $2-$@ - command to execute function log_mustnot_expect { log_neg_expect "$@" (( $? != 0 )) && log_fail } # Signal numbers are platform-dependent case $(uname) in Darwin|FreeBSD) SIGBUS=10 SIGSEGV=11 ;; illumos|Linux|*) SIGBUS=7 SIGSEGV=11 ;; esac EXIT_SUCCESS=0 EXIT_NOTFOUND=127 EXIT_SIGNAL=256 EXIT_SIGBUS=$((EXIT_SIGNAL + SIGBUS)) EXIT_SIGSEGV=$((EXIT_SIGNAL + SIGSEGV)) # Execute and print command with status where success equals non-zero result # or output includes expected keyword # # $1 - keyword expected # $2-$@ - command to execute # # return 0 if command fails, or the output contains the keyword expected, # return 1 otherwise function log_neg_expect { typeset out="" typeset logfile="/tmp/log.$$" typeset ret=1 typeset expect=$1 shift while [[ -e $logfile ]]; do logfile="$logfile.$$" done "$@" 2>$logfile typeset status=$? out="cat $logfile" # unexpected status if (( $status == EXIT_SUCCESS )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status" # missing binary elif (( $status == EXIT_NOTFOUND )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (File not found)" # bus error - core dump elif (( $status == EXIT_SIGBUS )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (Bus Error)" # segmentation violation - core dump elif (( $status == EXIT_SIGSEGV )); then print -u2 $($out) _printerror "$@" "unexpectedly exited $status (SEGV)" else $out | egrep -i "internal error|assertion failed" \ > /dev/null 2>&1 # internal error or assertion failed if (( $? == 0 )); then print -u2 $($out) _printerror "$@" "internal error or assertion failure" \ " exited $status" elif [[ -n $expect ]] ; then $out | grep -i "$expect" > /dev/null 2>&1 if (( $? == 0 )); then ret=0 else print -u2 $($out) _printerror "$@" "unexpectedly exited $status" fi else ret=0 fi if (( $ret == 0 )); then [[ -n $LOGAPI_DEBUG ]] && cat $logfile _printsuccess "$@" "exited $status" fi fi _recursive_output $logfile "false" return $ret } # Execute and print command with status where success equals zero result # # $@ command to execute # # return command exit status function log_pos { typeset out="" typeset logfile="/tmp/log.$$" while [[ -e $logfile ]]; do logfile="$logfile.$$" done "$@" 2>$logfile typeset status=$? out="cat $logfile" if (( $status != 0 )) ; then print -u2 $($out) _printerror "$@" "exited $status" else $out | egrep -i "internal error|assertion failed" \ > /dev/null 2>&1 # internal error or assertion failed if [[ $? -eq 0 ]]; then print -u2 $($out) _printerror "$@" "internal error or assertion failure" \ " exited $status" status=1 else [[ -n $LOGAPI_DEBUG ]] && cat $logfile _printsuccess "$@" fi fi _recursive_output $logfile "false" return $status } +# Execute and print command with status where success equals zero result +# and no stderr output +# +# $@ command to execute +# +# return 0 if command succeeds and no stderr output +# return 1 othersie + +function log_pos_nostderr +{ + typeset out="" + typeset logfile="/tmp/log.$$" + + while [[ -e $logfile ]]; do + logfile="$logfile.$$" + done + + "$@" 2>$logfile + typeset status=$? + out="cat $logfile" + typeset out_msg=$($out) + + if (( $status != 0 )) ; then + print -u2 $out_msg + _printerror "$@" "exited $status" + else + if [[ ! -z "$out_msg" ]]; then + print -u2 $out_msg + _printerror "$@" "message in stderr" \ + " exited $status" + status=1 + else + [[ -n $LOGAPI_DEBUG ]] && cat $logfile + _printsuccess "$@" + fi + fi + _recursive_output $logfile "false" + return $status +} + # Set an exit handler # # $@ - function(s) to perform on exit function log_onexit { _CLEANUP=("$*") } # Push an exit handler on the cleanup stack # # $@ - function(s) to perform on exit function log_onexit_push { _CLEANUP+=("$*") } # Pop an exit handler off the cleanup stack function log_onexit_pop { _CLEANUP=("${_CLEANUP[@]:0:${#_CLEANUP[@]}-1}") } # # Exit functions # # Perform cleanup and exit $STF_PASS # # $@ - message text function log_pass { _endlog $STF_PASS "$@" } # Perform cleanup and exit $STF_FAIL # # $@ - message text function log_fail { _endlog $STF_FAIL "$@" } # Perform cleanup and exit $STF_UNRESOLVED # # $@ - message text function log_unresolved { _endlog $STF_UNRESOLVED "$@" } # Perform cleanup and exit $STF_NOTINUSE # # $@ - message text function log_notinuse { _endlog $STF_NOTINUSE "$@" } # Perform cleanup and exit $STF_UNSUPPORTED # # $@ - message text function log_unsupported { _endlog $STF_UNSUPPORTED "$@" } # Perform cleanup and exit $STF_UNTESTED # # $@ - message text function log_untested { _endlog $STF_UNTESTED "$@" } # Perform cleanup and exit $STF_UNINITIATED # # $@ - message text function log_uninitiated { _endlog $STF_UNINITIATED "$@" } # Perform cleanup and exit $STF_NORESULT # # $@ - message text function log_noresult { _endlog $STF_NORESULT "$@" } # Perform cleanup and exit $STF_WARNING # # $@ - message text function log_warning { _endlog $STF_WARNING "$@" } # Perform cleanup and exit $STF_TIMED_OUT # # $@ - message text function log_timed_out { _endlog $STF_TIMED_OUT "$@" } # Perform cleanup and exit $STF_OTHER # # $@ - message text function log_other { _endlog $STF_OTHER "$@" } function set_main_pid { _MAINPID=$1 } # # Internal functions # # Execute custom callback scripts on test failure # # callback script paths are stored in TESTFAIL_CALLBACKS, delimited by ':'. function _execute_testfail_callbacks { typeset callback print "$TESTFAIL_CALLBACKS:" | while read -d ":" callback; do if [[ -n "$callback" ]] ; then log_note "Performing test-fail callback ($callback)" $callback fi done } # Perform cleanup and exit # # $1 - stf exit code # $2-$n - message text function _endlog { typeset logfile="/tmp/log.$$" _recursive_output $logfile typeset exitcode=$1 shift (( ${#@} > 0 )) && _printline "$@" # # If we're running in a subshell then just exit and let # the parent handle the failures # if [[ -n "$_MAINPID" && $$ != "$_MAINPID" ]]; then log_note "subshell exited: "$_MAINPID exit $exitcode fi if [[ $exitcode == $STF_FAIL ]] ; then _execute_testfail_callbacks fi typeset stack=("${_CLEANUP[@]}") log_onexit "" typeset i=${#stack[@]} while (( i-- )); do typeset cleanup="${stack[i]}" log_note "Performing local cleanup via log_onexit ($cleanup)" $cleanup done exit $exitcode } # Output a formatted line # # $@ - message text function _printline { print "$@" } # Output an error message # # $@ - message text function _printerror { _printline ERROR: "$@" } # Output a success message # # $@ - message text function _printsuccess { _printline SUCCESS: "$@" } # Output logfiles recursively # # $1 - start file # $2 - indicate whether output the start file itself, default as yes. function _recursive_output #logfile { typeset logfile=$1 while [[ -e $logfile ]]; do if [[ -z $2 || $logfile != $1 ]]; then cat $logfile fi rm -f $logfile logfile="$logfile.$$" done } diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index b671af7d8f42..0e552c2680a0 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -1,1063 +1,1063 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * Test the nvpair inputs for the non-legacy zfs ioctl commands. */ boolean_t unexpected_failures; int zfs_fd; const char *active_test; /* * Tracks which zfs_ioc_t commands were tested */ boolean_t ioc_tested[ZFS_IOC_LAST - ZFS_IOC_FIRST]; /* * Legacy ioctls that are skipped (for now) */ static unsigned ioc_skip[] = { ZFS_IOC_POOL_CREATE, ZFS_IOC_POOL_DESTROY, ZFS_IOC_POOL_IMPORT, ZFS_IOC_POOL_EXPORT, ZFS_IOC_POOL_CONFIGS, ZFS_IOC_POOL_STATS, ZFS_IOC_POOL_TRYIMPORT, ZFS_IOC_POOL_SCAN, ZFS_IOC_POOL_FREEZE, ZFS_IOC_POOL_UPGRADE, ZFS_IOC_POOL_GET_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, ZFS_IOC_DESTROY, ZFS_IOC_RENAME, ZFS_IOC_RECV, ZFS_IOC_SEND, ZFS_IOC_INJECT_FAULT, ZFS_IOC_CLEAR_FAULT, ZFS_IOC_INJECT_LIST_NEXT, ZFS_IOC_ERROR_LOG, ZFS_IOC_CLEAR, ZFS_IOC_PROMOTE, ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, ZFS_IOC_POOL_SET_PROPS, ZFS_IOC_POOL_GET_PROPS, ZFS_IOC_SET_FSACL, ZFS_IOC_GET_FSACL, ZFS_IOC_SHARE, ZFS_IOC_INHERIT_PROP, ZFS_IOC_SMB_ACL, ZFS_IOC_USERSPACE_ONE, ZFS_IOC_USERSPACE_MANY, ZFS_IOC_USERSPACE_UPGRADE, ZFS_IOC_OBJSET_RECVD_PROPS, ZFS_IOC_VDEV_SPLIT, ZFS_IOC_NEXT_OBJ, ZFS_IOC_DIFF, ZFS_IOC_TMP_SNAPSHOT, ZFS_IOC_OBJ_TO_STATS, ZFS_IOC_SPACE_WRITTEN, ZFS_IOC_POOL_REGUID, ZFS_IOC_SEND_PROGRESS, ZFS_IOC_EVENTS_NEXT, ZFS_IOC_EVENTS_CLEAR, ZFS_IOC_EVENTS_SEEK, ZFS_IOC_NEXTBOOT, ZFS_IOC_JAIL, ZFS_IOC_UNJAIL, }; #define IOC_INPUT_TEST(ioc, name, req, opt, err) \ IOC_INPUT_TEST_IMPL(ioc, name, req, opt, err, B_FALSE) #define IOC_INPUT_TEST_WILD(ioc, name, req, opt, err) \ IOC_INPUT_TEST_IMPL(ioc, name, req, opt, err, B_TRUE) #define IOC_INPUT_TEST_IMPL(ioc, name, req, opt, err, wild) \ do { \ active_test = __func__ + 5; \ ioc_tested[ioc - ZFS_IOC_FIRST] = B_TRUE; \ lzc_ioctl_test(ioc, name, req, opt, err, wild); \ } while (0) /* * run a zfs ioctl command, verify expected results and log failures */ static void lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) { zfs_cmd_t zc = {"\0"}; char *packed = NULL; const char *variant; size_t size = 0; int error = 0; switch (expected) { case ZFS_ERR_IOC_ARG_UNAVAIL: variant = "unsupported input"; break; case ZFS_ERR_IOC_ARG_REQUIRED: variant = "missing input"; break; case ZFS_ERR_IOC_ARG_BADTYPE: variant = "invalid input type"; break; default: variant = "valid input"; break; } packed = fnvlist_pack(innvl, &size); (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; zc.zc_nvlist_src_size = size; zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(zc.zc_nvlist_dst_size); - if (zfs_ioctl_fd(zfs_fd, ioc, &zc) != 0) + if (lzc_ioctl_fd(zfs_fd, ioc, &zc) != 0) error = errno; if (error != expected) { unexpected_failures = B_TRUE; (void) fprintf(stderr, "%s: Unexpected result with %s, " "error %d (expecting %d)\n", active_test, variant, error, expected); } fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zc.zc_nvlist_dst); } /* * Test each ioc for the following ioctl input errors: * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ static int lzc_ioctl_test(zfs_ioc_t ioc, const char *name, nvlist_t *required, nvlist_t *optional, int expected_error, boolean_t wildcard) { nvlist_t *input = fnvlist_alloc(); nvlist_t *future = fnvlist_alloc(); int error = 0; if (required != NULL) { for (nvpair_t *pair = nvlist_next_nvpair(required, NULL); pair != NULL; pair = nvlist_next_nvpair(required, pair)) { fnvlist_add_nvpair(input, pair); } } if (optional != NULL) { for (nvpair_t *pair = nvlist_next_nvpair(optional, NULL); pair != NULL; pair = nvlist_next_nvpair(optional, pair)) { fnvlist_add_nvpair(input, pair); } } /* * Generic input run with 'optional' nvlist pair */ if (!wildcard) fnvlist_add_nvlist(input, "optional", future); lzc_ioctl_run(ioc, name, input, expected_error); if (!wildcard) fnvlist_remove(input, "optional"); /* * Bogus input value */ if (!wildcard) { fnvlist_add_string(input, "bogus_input", "bogus"); lzc_ioctl_run(ioc, name, input, ZFS_ERR_IOC_ARG_UNAVAIL); fnvlist_remove(input, "bogus_input"); } /* * Missing required inputs */ if (required != NULL) { nvlist_t *empty = fnvlist_alloc(); lzc_ioctl_run(ioc, name, empty, ZFS_ERR_IOC_ARG_REQUIRED); nvlist_free(empty); } /* * Wrong nvpair type */ if (required != NULL || optional != NULL) { /* * switch the type of one of the input pairs */ for (nvpair_t *pair = nvlist_next_nvpair(input, NULL); pair != NULL; pair = nvlist_next_nvpair(input, pair)) { char pname[MAXNAMELEN]; data_type_t ptype; strlcpy(pname, nvpair_name(pair), sizeof (pname)); pname[sizeof (pname) - 1] = '\0'; ptype = nvpair_type(pair); fnvlist_remove_nvpair(input, pair); switch (ptype) { case DATA_TYPE_STRING: fnvlist_add_uint64(input, pname, 42); break; default: fnvlist_add_string(input, pname, "bogus"); break; } } lzc_ioctl_run(ioc, name, input, ZFS_ERR_IOC_ARG_BADTYPE); } nvlist_free(future); nvlist_free(input); return (error); } static void test_pool_sync(const char *pool) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_boolean_value(required, "force", B_TRUE); IOC_INPUT_TEST(ZFS_IOC_POOL_SYNC, pool, required, NULL, 0); nvlist_free(required); } static void test_pool_reopen(const char *pool) { nvlist_t *optional = fnvlist_alloc(); fnvlist_add_boolean_value(optional, "scrub_restart", B_FALSE); IOC_INPUT_TEST(ZFS_IOC_POOL_REOPEN, pool, NULL, optional, 0); nvlist_free(optional); } static void test_pool_checkpoint(const char *pool) { IOC_INPUT_TEST(ZFS_IOC_POOL_CHECKPOINT, pool, NULL, NULL, 0); } static void test_pool_discard_checkpoint(const char *pool) { int err = lzc_pool_checkpoint(pool); if (err == 0 || err == ZFS_ERR_CHECKPOINT_EXISTS) IOC_INPUT_TEST(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, NULL, NULL, 0); } static void test_log_history(const char *pool) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_string(required, "message", "input check"); IOC_INPUT_TEST(ZFS_IOC_LOG_HISTORY, pool, required, NULL, 0); nvlist_free(required); } static void test_create(const char *pool) { char dataset[MAXNAMELEN + 32]; (void) snprintf(dataset, sizeof (dataset), "%s/create-fs", pool); nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); fnvlist_add_int32(required, "type", DMU_OST_ZFS); fnvlist_add_uint64(props, "recordsize", 8192); fnvlist_add_nvlist(optional, "props", props); IOC_INPUT_TEST(ZFS_IOC_CREATE, dataset, required, optional, 0); nvlist_free(required); nvlist_free(optional); } static void test_snapshot(const char *pool, const char *snapshot) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *snaps = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); fnvlist_add_boolean(snaps, snapshot); fnvlist_add_nvlist(required, "snaps", snaps); fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013"); fnvlist_add_nvlist(optional, "props", props); IOC_INPUT_TEST(ZFS_IOC_SNAPSHOT, pool, required, optional, 0); nvlist_free(props); nvlist_free(snaps); nvlist_free(optional); nvlist_free(required); } static void test_space_snaps(const char *snapshot) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_string(required, "firstsnap", snapshot); IOC_INPUT_TEST(ZFS_IOC_SPACE_SNAPS, snapshot, required, NULL, 0); nvlist_free(required); } static void test_destroy_snaps(const char *pool, const char *snapshot) { nvlist_t *required = fnvlist_alloc(); nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, snapshot); fnvlist_add_nvlist(required, "snaps", snaps); IOC_INPUT_TEST(ZFS_IOC_DESTROY_SNAPS, pool, required, NULL, 0); nvlist_free(snaps); nvlist_free(required); } static void test_bookmark(const char *pool, const char *snapshot, const char *bookmark) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_string(required, bookmark, snapshot); IOC_INPUT_TEST_WILD(ZFS_IOC_BOOKMARK, pool, required, NULL, 0); nvlist_free(required); } static void test_get_bookmarks(const char *dataset) { nvlist_t *optional = fnvlist_alloc(); fnvlist_add_boolean(optional, "guid"); fnvlist_add_boolean(optional, "createtxg"); fnvlist_add_boolean(optional, "creation"); IOC_INPUT_TEST_WILD(ZFS_IOC_GET_BOOKMARKS, dataset, NULL, optional, 0); nvlist_free(optional); } static void test_destroy_bookmarks(const char *pool, const char *bookmark) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_boolean(required, bookmark); IOC_INPUT_TEST_WILD(ZFS_IOC_DESTROY_BOOKMARKS, pool, required, NULL, 0); nvlist_free(required); } static void test_clone(const char *snapshot, const char *clone) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); fnvlist_add_string(required, "origin", snapshot); IOC_INPUT_TEST(ZFS_IOC_CLONE, clone, required, NULL, 0); nvlist_free(props); nvlist_free(optional); nvlist_free(required); } static void test_rollback(const char *dataset, const char *snapshot) { nvlist_t *optional = fnvlist_alloc(); fnvlist_add_string(optional, "target", snapshot); IOC_INPUT_TEST(ZFS_IOC_ROLLBACK, dataset, NULL, optional, B_FALSE); nvlist_free(optional); } static void test_hold(const char *pool, const char *snapshot) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *holds = fnvlist_alloc(); fnvlist_add_string(holds, snapshot, "libzfs_check_hold"); fnvlist_add_nvlist(required, "holds", holds); fnvlist_add_int32(optional, "cleanup_fd", zfs_fd); IOC_INPUT_TEST(ZFS_IOC_HOLD, pool, required, optional, 0); nvlist_free(holds); nvlist_free(optional); nvlist_free(required); } static void test_get_holds(const char *snapshot) { IOC_INPUT_TEST(ZFS_IOC_GET_HOLDS, snapshot, NULL, NULL, 0); } static void test_release(const char *pool, const char *snapshot) { nvlist_t *required = fnvlist_alloc(); nvlist_t *release = fnvlist_alloc(); fnvlist_add_boolean(release, "libzfs_check_hold"); fnvlist_add_nvlist(required, snapshot, release); IOC_INPUT_TEST_WILD(ZFS_IOC_RELEASE, pool, required, NULL, 0); nvlist_free(release); nvlist_free(required); } static void test_send_new(const char *snapshot, int fd) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); fnvlist_add_int32(required, "fd", fd); fnvlist_add_boolean(optional, "largeblockok"); fnvlist_add_boolean(optional, "embedok"); fnvlist_add_boolean(optional, "compressok"); fnvlist_add_boolean(optional, "rawok"); /* * TODO - Resumable send is harder to set up. So we currently * ignore testing for that variant. */ #if 0 fnvlist_add_string(optional, "fromsnap", from); fnvlist_add_uint64(optional, "resume_object", resumeobj); fnvlist_add_uint64(optional, "resume_offset", offset); fnvlist_add_boolean(optional, "savedok"); #endif IOC_INPUT_TEST(ZFS_IOC_SEND_NEW, snapshot, required, optional, 0); nvlist_free(optional); nvlist_free(required); } static void test_recv_new(const char *dataset, int fd) { dmu_replay_record_t drr = { 0 }; nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); char snapshot[MAXNAMELEN + 32]; ssize_t count; int cleanup_fd = open(ZFS_DEV, O_RDWR); (void) snprintf(snapshot, sizeof (snapshot), "%s@replicant", dataset); count = pread(fd, &drr, sizeof (drr), 0); if (count != sizeof (drr)) { (void) fprintf(stderr, "could not read stream: %s\n", strerror(errno)); } fnvlist_add_string(required, "snapname", snapshot); fnvlist_add_byte_array(required, "begin_record", (uchar_t *)&drr, sizeof (drr)); fnvlist_add_int32(required, "input_fd", fd); fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013"); fnvlist_add_nvlist(optional, "localprops", props); fnvlist_add_boolean(optional, "force"); fnvlist_add_int32(optional, "cleanup_fd", cleanup_fd); /* * TODO - Resumable receive is harder to set up. So we currently * ignore testing for one. */ #if 0 fnvlist_add_nvlist(optional, "props", recvdprops); fnvlist_add_string(optional, "origin", origin); fnvlist_add_boolean(optional, "resumable"); fnvlist_add_uint64(optional, "action_handle", *action_handle); #endif IOC_INPUT_TEST(ZFS_IOC_RECV_NEW, dataset, required, optional, ZFS_ERR_STREAM_TRUNCATED); nvlist_free(props); nvlist_free(optional); nvlist_free(required); (void) close(cleanup_fd); } static void test_send_space(const char *snapshot1, const char *snapshot2) { nvlist_t *optional = fnvlist_alloc(); fnvlist_add_string(optional, "from", snapshot1); fnvlist_add_boolean(optional, "largeblockok"); fnvlist_add_boolean(optional, "embedok"); fnvlist_add_boolean(optional, "compressok"); fnvlist_add_boolean(optional, "rawok"); IOC_INPUT_TEST(ZFS_IOC_SEND_SPACE, snapshot2, NULL, optional, 0); nvlist_free(optional); } static void test_remap(const char *dataset) { IOC_INPUT_TEST(ZFS_IOC_REMAP, dataset, NULL, NULL, 0); } static void test_channel_program(const char *pool) { const char *program = "arg = ...\n" "argv = arg[\"argv\"]\n" "return argv[1]"; char *const argv[1] = { "Hello World!" }; nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *args = fnvlist_alloc(); fnvlist_add_string(required, "program", program); fnvlist_add_string_array(args, "argv", argv, 1); fnvlist_add_nvlist(required, "arg", args); fnvlist_add_boolean_value(optional, "sync", B_TRUE); fnvlist_add_uint64(optional, "instrlimit", 1000 * 1000); fnvlist_add_uint64(optional, "memlimit", 8192 * 1024); IOC_INPUT_TEST(ZFS_IOC_CHANNEL_PROGRAM, pool, required, optional, 0); nvlist_free(args); nvlist_free(optional); nvlist_free(required); } #define WRAPPING_KEY_LEN 32 static void test_load_key(const char *dataset) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *hidden = fnvlist_alloc(); uint8_t keydata[WRAPPING_KEY_LEN] = {0}; fnvlist_add_uint8_array(hidden, "wkeydata", keydata, sizeof (keydata)); fnvlist_add_nvlist(required, "hidden_args", hidden); fnvlist_add_boolean(optional, "noop"); IOC_INPUT_TEST(ZFS_IOC_LOAD_KEY, dataset, required, optional, EINVAL); nvlist_free(hidden); nvlist_free(optional); nvlist_free(required); } static void test_change_key(const char *dataset) { IOC_INPUT_TEST(ZFS_IOC_CHANGE_KEY, dataset, NULL, NULL, EINVAL); } static void test_unload_key(const char *dataset) { IOC_INPUT_TEST(ZFS_IOC_UNLOAD_KEY, dataset, NULL, NULL, EACCES); } static void test_vdev_initialize(const char *pool) { nvlist_t *required = fnvlist_alloc(); nvlist_t *vdev_guids = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef); fnvlist_add_uint64(required, ZPOOL_INITIALIZE_COMMAND, POOL_INITIALIZE_START); fnvlist_add_nvlist(required, ZPOOL_INITIALIZE_VDEVS, vdev_guids); IOC_INPUT_TEST(ZFS_IOC_POOL_INITIALIZE, pool, required, NULL, EINVAL); nvlist_free(vdev_guids); nvlist_free(required); } static void test_vdev_trim(const char *pool) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); nvlist_t *vdev_guids = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef); fnvlist_add_uint64(required, ZPOOL_TRIM_COMMAND, POOL_TRIM_START); fnvlist_add_nvlist(required, ZPOOL_TRIM_VDEVS, vdev_guids); fnvlist_add_uint64(optional, ZPOOL_TRIM_RATE, 1ULL << 30); fnvlist_add_boolean_value(optional, ZPOOL_TRIM_SECURE, B_TRUE); IOC_INPUT_TEST(ZFS_IOC_POOL_TRIM, pool, required, optional, EINVAL); nvlist_free(vdev_guids); nvlist_free(optional); nvlist_free(required); } static int zfs_destroy(const char *dataset) { zfs_cmd_t zc = {"\0"}; int err; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; - err = zfs_ioctl_fd(zfs_fd, ZFS_IOC_DESTROY, &zc); + err = lzc_ioctl_fd(zfs_fd, ZFS_IOC_DESTROY, &zc); return (err == 0 ? 0 : errno); } static void test_redact(const char *snapshot1, const char *snapshot2) { nvlist_t *required = fnvlist_alloc(); nvlist_t *snapnv = fnvlist_alloc(); char bookmark[MAXNAMELEN + 32]; fnvlist_add_string(required, "bookname", "testbookmark"); fnvlist_add_boolean(snapnv, snapshot2); fnvlist_add_nvlist(required, "snapnv", snapnv); IOC_INPUT_TEST(ZFS_IOC_REDACT, snapshot1, required, NULL, 0); nvlist_free(snapnv); nvlist_free(required); strlcpy(bookmark, snapshot1, sizeof (bookmark)); *strchr(bookmark, '@') = '\0'; strlcat(bookmark, "#testbookmark", sizeof (bookmark) - strlen(bookmark)); zfs_destroy(bookmark); } static void test_get_bookmark_props(const char *bookmark) { IOC_INPUT_TEST(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, NULL, NULL, 0); } static void test_wait(const char *pool) { nvlist_t *required = fnvlist_alloc(); nvlist_t *optional = fnvlist_alloc(); fnvlist_add_int32(required, "wait_activity", 2); fnvlist_add_uint64(optional, "wait_tag", 0xdeadbeefdeadbeef); IOC_INPUT_TEST(ZFS_IOC_WAIT, pool, required, optional, EINVAL); nvlist_free(required); nvlist_free(optional); } static void test_wait_fs(const char *dataset) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_int32(required, "wait_activity", 2); IOC_INPUT_TEST(ZFS_IOC_WAIT_FS, dataset, required, NULL, EINVAL); nvlist_free(required); } static void test_get_bootenv(const char *pool) { IOC_INPUT_TEST(ZFS_IOC_GET_BOOTENV, pool, NULL, NULL, 0); } static void test_set_bootenv(const char *pool) { nvlist_t *required = fnvlist_alloc(); fnvlist_add_uint64(required, "version", VB_RAW); fnvlist_add_string(required, GRUB_ENVMAP, "test"); IOC_INPUT_TEST_WILD(ZFS_IOC_SET_BOOTENV, pool, required, NULL, 0); nvlist_free(required); } static void zfs_ioc_input_tests(const char *pool) { char filepath[] = "/tmp/ioc_test_file_XXXXXX"; char dataset[ZFS_MAX_DATASET_NAME_LEN]; char snapbase[ZFS_MAX_DATASET_NAME_LEN + 32]; char snapshot[ZFS_MAX_DATASET_NAME_LEN + 32]; char bookmark[ZFS_MAX_DATASET_NAME_LEN + 32]; char backup[ZFS_MAX_DATASET_NAME_LEN]; char clone[ZFS_MAX_DATASET_NAME_LEN]; char clonesnap[ZFS_MAX_DATASET_NAME_LEN + 32]; int tmpfd, err; /* * Setup names and create a working dataset */ (void) snprintf(dataset, sizeof (dataset), "%s/test-fs", pool); (void) snprintf(snapbase, sizeof (snapbase), "%s@snapbase", dataset); (void) snprintf(snapshot, sizeof (snapshot), "%s@snapshot", dataset); (void) snprintf(bookmark, sizeof (bookmark), "%s#bookmark", dataset); (void) snprintf(clone, sizeof (clone), "%s/test-fs-clone", pool); (void) snprintf(clonesnap, sizeof (clonesnap), "%s@snap", clone); (void) snprintf(backup, sizeof (backup), "%s/backup", pool); err = lzc_create(dataset, LZC_DATSET_TYPE_ZFS, NULL, NULL, -1); if (err) { (void) fprintf(stderr, "could not create '%s': %s\n", dataset, strerror(errno)); exit(2); } tmpfd = mkstemp(filepath); if (tmpfd < 0) { (void) fprintf(stderr, "could not create '%s': %s\n", filepath, strerror(errno)); exit(2); } /* * run a test for each ioctl * Note that some test build on previous test operations */ test_pool_sync(pool); test_pool_reopen(pool); test_pool_checkpoint(pool); test_pool_discard_checkpoint(pool); test_log_history(pool); test_create(dataset); test_snapshot(pool, snapbase); test_snapshot(pool, snapshot); test_space_snaps(snapshot); test_send_space(snapbase, snapshot); test_send_new(snapshot, tmpfd); test_recv_new(backup, tmpfd); test_bookmark(pool, snapshot, bookmark); test_get_bookmarks(dataset); test_get_bookmark_props(bookmark); test_destroy_bookmarks(pool, bookmark); test_hold(pool, snapshot); test_get_holds(snapshot); test_release(pool, snapshot); test_clone(snapshot, clone); test_snapshot(pool, clonesnap); test_redact(snapshot, clonesnap); zfs_destroy(clonesnap); zfs_destroy(clone); test_rollback(dataset, snapshot); test_destroy_snaps(pool, snapshot); test_destroy_snaps(pool, snapbase); test_remap(dataset); test_channel_program(pool); test_load_key(dataset); test_change_key(dataset); test_unload_key(dataset); test_vdev_initialize(pool); test_vdev_trim(pool); test_wait(pool); test_wait_fs(dataset); test_set_bootenv(pool); test_get_bootenv(pool); /* * cleanup */ zfs_cmd_t zc = {"\0"}; nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, snapshot); (void) lzc_destroy_snaps(snaps, B_FALSE, NULL); nvlist_free(snaps); (void) zfs_destroy(dataset); (void) zfs_destroy(backup); (void) close(tmpfd); (void) unlink(filepath); /* * All the unused slots should yield ZFS_ERR_IOC_CMD_UNAVAIL */ for (int i = 0; i < ARRAY_SIZE(ioc_skip); i++) { if (ioc_tested[ioc_skip[i] - ZFS_IOC_FIRST]) (void) fprintf(stderr, "cmd %d tested, not skipped!\n", (int)(ioc_skip[i] - ZFS_IOC_FIRST)); ioc_tested[ioc_skip[i] - ZFS_IOC_FIRST] = B_TRUE; } (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; for (unsigned ioc = ZFS_IOC_FIRST; ioc < ZFS_IOC_LAST; ioc++) { unsigned cmd = ioc - ZFS_IOC_FIRST; if (ioc_tested[cmd]) continue; - if (zfs_ioctl_fd(zfs_fd, ioc, &zc) != 0 && + if (lzc_ioctl_fd(zfs_fd, ioc, &zc) != 0 && errno != ZFS_ERR_IOC_CMD_UNAVAIL) { (void) fprintf(stderr, "cmd %d is missing a test case " "(%d)\n", cmd, errno); } } } enum zfs_ioc_ref { #ifdef __FreeBSD__ ZFS_IOC_BASE = 0, #else ZFS_IOC_BASE = ('Z' << 8), #endif ZFS_IOC_PLATFORM_BASE = ZFS_IOC_BASE + 0x80, }; /* * Canonical reference check of /dev/zfs ioctl numbers. * These cannot change and new ioctl numbers must be appended. */ static boolean_t validate_ioc_values(void) { boolean_t result = B_TRUE; #define CHECK(expr) do { \ if (!(expr)) { \ result = B_FALSE; \ fprintf(stderr, "(%s) === FALSE\n", #expr); \ } \ } while (0) CHECK(ZFS_IOC_BASE + 0 == ZFS_IOC_POOL_CREATE); CHECK(ZFS_IOC_BASE + 1 == ZFS_IOC_POOL_DESTROY); CHECK(ZFS_IOC_BASE + 2 == ZFS_IOC_POOL_IMPORT); CHECK(ZFS_IOC_BASE + 3 == ZFS_IOC_POOL_EXPORT); CHECK(ZFS_IOC_BASE + 4 == ZFS_IOC_POOL_CONFIGS); CHECK(ZFS_IOC_BASE + 5 == ZFS_IOC_POOL_STATS); CHECK(ZFS_IOC_BASE + 6 == ZFS_IOC_POOL_TRYIMPORT); CHECK(ZFS_IOC_BASE + 7 == ZFS_IOC_POOL_SCAN); CHECK(ZFS_IOC_BASE + 8 == ZFS_IOC_POOL_FREEZE); CHECK(ZFS_IOC_BASE + 9 == ZFS_IOC_POOL_UPGRADE); CHECK(ZFS_IOC_BASE + 10 == ZFS_IOC_POOL_GET_HISTORY); CHECK(ZFS_IOC_BASE + 11 == ZFS_IOC_VDEV_ADD); CHECK(ZFS_IOC_BASE + 12 == ZFS_IOC_VDEV_REMOVE); CHECK(ZFS_IOC_BASE + 13 == ZFS_IOC_VDEV_SET_STATE); CHECK(ZFS_IOC_BASE + 14 == ZFS_IOC_VDEV_ATTACH); CHECK(ZFS_IOC_BASE + 15 == ZFS_IOC_VDEV_DETACH); CHECK(ZFS_IOC_BASE + 16 == ZFS_IOC_VDEV_SETPATH); CHECK(ZFS_IOC_BASE + 17 == ZFS_IOC_VDEV_SETFRU); CHECK(ZFS_IOC_BASE + 18 == ZFS_IOC_OBJSET_STATS); CHECK(ZFS_IOC_BASE + 19 == ZFS_IOC_OBJSET_ZPLPROPS); CHECK(ZFS_IOC_BASE + 20 == ZFS_IOC_DATASET_LIST_NEXT); CHECK(ZFS_IOC_BASE + 21 == ZFS_IOC_SNAPSHOT_LIST_NEXT); CHECK(ZFS_IOC_BASE + 22 == ZFS_IOC_SET_PROP); CHECK(ZFS_IOC_BASE + 23 == ZFS_IOC_CREATE); CHECK(ZFS_IOC_BASE + 24 == ZFS_IOC_DESTROY); CHECK(ZFS_IOC_BASE + 25 == ZFS_IOC_ROLLBACK); CHECK(ZFS_IOC_BASE + 26 == ZFS_IOC_RENAME); CHECK(ZFS_IOC_BASE + 27 == ZFS_IOC_RECV); CHECK(ZFS_IOC_BASE + 28 == ZFS_IOC_SEND); CHECK(ZFS_IOC_BASE + 29 == ZFS_IOC_INJECT_FAULT); CHECK(ZFS_IOC_BASE + 30 == ZFS_IOC_CLEAR_FAULT); CHECK(ZFS_IOC_BASE + 31 == ZFS_IOC_INJECT_LIST_NEXT); CHECK(ZFS_IOC_BASE + 32 == ZFS_IOC_ERROR_LOG); CHECK(ZFS_IOC_BASE + 33 == ZFS_IOC_CLEAR); CHECK(ZFS_IOC_BASE + 34 == ZFS_IOC_PROMOTE); CHECK(ZFS_IOC_BASE + 35 == ZFS_IOC_SNAPSHOT); CHECK(ZFS_IOC_BASE + 36 == ZFS_IOC_DSOBJ_TO_DSNAME); CHECK(ZFS_IOC_BASE + 37 == ZFS_IOC_OBJ_TO_PATH); CHECK(ZFS_IOC_BASE + 38 == ZFS_IOC_POOL_SET_PROPS); CHECK(ZFS_IOC_BASE + 39 == ZFS_IOC_POOL_GET_PROPS); CHECK(ZFS_IOC_BASE + 40 == ZFS_IOC_SET_FSACL); CHECK(ZFS_IOC_BASE + 41 == ZFS_IOC_GET_FSACL); CHECK(ZFS_IOC_BASE + 42 == ZFS_IOC_SHARE); CHECK(ZFS_IOC_BASE + 43 == ZFS_IOC_INHERIT_PROP); CHECK(ZFS_IOC_BASE + 44 == ZFS_IOC_SMB_ACL); CHECK(ZFS_IOC_BASE + 45 == ZFS_IOC_USERSPACE_ONE); CHECK(ZFS_IOC_BASE + 46 == ZFS_IOC_USERSPACE_MANY); CHECK(ZFS_IOC_BASE + 47 == ZFS_IOC_USERSPACE_UPGRADE); CHECK(ZFS_IOC_BASE + 48 == ZFS_IOC_HOLD); CHECK(ZFS_IOC_BASE + 49 == ZFS_IOC_RELEASE); CHECK(ZFS_IOC_BASE + 50 == ZFS_IOC_GET_HOLDS); CHECK(ZFS_IOC_BASE + 51 == ZFS_IOC_OBJSET_RECVD_PROPS); CHECK(ZFS_IOC_BASE + 52 == ZFS_IOC_VDEV_SPLIT); CHECK(ZFS_IOC_BASE + 53 == ZFS_IOC_NEXT_OBJ); CHECK(ZFS_IOC_BASE + 54 == ZFS_IOC_DIFF); CHECK(ZFS_IOC_BASE + 55 == ZFS_IOC_TMP_SNAPSHOT); CHECK(ZFS_IOC_BASE + 56 == ZFS_IOC_OBJ_TO_STATS); CHECK(ZFS_IOC_BASE + 57 == ZFS_IOC_SPACE_WRITTEN); CHECK(ZFS_IOC_BASE + 58 == ZFS_IOC_SPACE_SNAPS); CHECK(ZFS_IOC_BASE + 59 == ZFS_IOC_DESTROY_SNAPS); CHECK(ZFS_IOC_BASE + 60 == ZFS_IOC_POOL_REGUID); CHECK(ZFS_IOC_BASE + 61 == ZFS_IOC_POOL_REOPEN); CHECK(ZFS_IOC_BASE + 62 == ZFS_IOC_SEND_PROGRESS); CHECK(ZFS_IOC_BASE + 63 == ZFS_IOC_LOG_HISTORY); CHECK(ZFS_IOC_BASE + 64 == ZFS_IOC_SEND_NEW); CHECK(ZFS_IOC_BASE + 65 == ZFS_IOC_SEND_SPACE); CHECK(ZFS_IOC_BASE + 66 == ZFS_IOC_CLONE); CHECK(ZFS_IOC_BASE + 67 == ZFS_IOC_BOOKMARK); CHECK(ZFS_IOC_BASE + 68 == ZFS_IOC_GET_BOOKMARKS); CHECK(ZFS_IOC_BASE + 69 == ZFS_IOC_DESTROY_BOOKMARKS); CHECK(ZFS_IOC_BASE + 70 == ZFS_IOC_RECV_NEW); CHECK(ZFS_IOC_BASE + 71 == ZFS_IOC_POOL_SYNC); CHECK(ZFS_IOC_BASE + 72 == ZFS_IOC_CHANNEL_PROGRAM); CHECK(ZFS_IOC_BASE + 73 == ZFS_IOC_LOAD_KEY); CHECK(ZFS_IOC_BASE + 74 == ZFS_IOC_UNLOAD_KEY); CHECK(ZFS_IOC_BASE + 75 == ZFS_IOC_CHANGE_KEY); CHECK(ZFS_IOC_BASE + 76 == ZFS_IOC_REMAP); CHECK(ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT); CHECK(ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT); CHECK(ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE); CHECK(ZFS_IOC_BASE + 80 == ZFS_IOC_POOL_TRIM); CHECK(ZFS_IOC_BASE + 81 == ZFS_IOC_REDACT); CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS); CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT); CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS); CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT); CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR); CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK); CHECK(ZFS_IOC_PLATFORM_BASE + 4 == ZFS_IOC_NEXTBOOT); CHECK(ZFS_IOC_PLATFORM_BASE + 5 == ZFS_IOC_JAIL); CHECK(ZFS_IOC_PLATFORM_BASE + 6 == ZFS_IOC_UNJAIL); CHECK(ZFS_IOC_PLATFORM_BASE + 7 == ZFS_IOC_SET_BOOTENV); CHECK(ZFS_IOC_PLATFORM_BASE + 8 == ZFS_IOC_GET_BOOTENV); #undef CHECK return (result); } int main(int argc, const char *argv[]) { if (argc != 2) { (void) fprintf(stderr, "usage: %s \n", argv[0]); exit(2); } if (!validate_ioc_values()) { (void) fprintf(stderr, "WARNING: zfs_ioc_t has binary " "incompatible command values\n"); exit(3); } (void) libzfs_core_init(); zfs_fd = open(ZFS_DEV, O_RDWR); if (zfs_fd < 0) { (void) fprintf(stderr, "open: %s\n", strerror(errno)); libzfs_core_fini(); exit(2); } zfs_ioc_input_tests(argv[1]); (void) close(zfs_fd); libzfs_core_fini(); return (unexpected_failures); } diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/Makefile.am index 8d5885e08447..abae69dea8c7 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/Makefile.am @@ -1,8 +1,6 @@ include $(top_srcdir)/config/Rules.am pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin pkgexec_PROGRAMS = mkbusy mkbusy_SOURCES = mkbusy.c - -mkbusy_LDADD = $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/mkbusy.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/mkbusy.c index 58ea1d07a40b..e1cbd95cd118 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/mkbusy.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mkbusy/mkbusy.c @@ -1,153 +1,167 @@ /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright (c) 2012 by Delphix. All rights reserved. */ /* * Make a directory busy. If the argument is an existing file or directory, * simply open it directly and pause. If not, verify that the parent directory * exists, and create a new file in that directory. */ #include #include #include #include #include #include #include #include #include #include -#include static __attribute__((noreturn)) void usage(char *progname) { (void) fprintf(stderr, "Usage: %s \n", progname); exit(1); } static __attribute__((noreturn)) void fail(char *err) { perror(err); exit(1); } static void daemonize(void) { pid_t pid; if ((pid = fork()) < 0) { fail("fork"); } else if (pid != 0) { (void) fprintf(stdout, "%ld\n", (long)pid); exit(0); } (void) setsid(); (void) close(0); (void) close(1); (void) close(2); } + +static const char * +get_basename(const char *path) +{ + const char *bn = strrchr(path, '/'); + return (bn ? bn + 1 : path); +} + +static ssize_t +get_dirnamelen(const char *path) +{ + const char *end = strrchr(path, '/'); + return (end ? end - path : -1); +} + int main(int argc, char *argv[]) { int c; boolean_t isdir = B_FALSE; struct stat sbuf; char *fpath = NULL; char *prog = argv[0]; while ((c = getopt(argc, argv, "")) != -1) { switch (c) { default: usage(prog); } } argc -= optind; argv += optind; if (argc != 1) usage(prog); if (stat(argv[0], &sbuf) != 0) { char *arg; const char *dname, *fname; size_t arglen; ssize_t dnamelen; /* * The argument supplied doesn't exist. Copy the path, and * remove the trailing slash if present. */ if ((arg = strdup(argv[0])) == NULL) fail("strdup"); arglen = strlen(arg); if (arg[arglen - 1] == '/') arg[arglen - 1] = '\0'; /* Get the directory and file names. */ - fname = zfs_basename(arg); + fname = get_basename(arg); dname = arg; - if ((dnamelen = zfs_dirnamelen(arg)) != -1) + if ((dnamelen = get_dirnamelen(arg)) != -1) arg[dnamelen] = '\0'; else dname = "."; /* The directory portion of the path must exist */ if (stat(dname, &sbuf) != 0 || !(sbuf.st_mode & S_IFDIR)) usage(prog); if (asprintf(&fpath, "%s/%s", dname, fname) == -1) fail("asprintf"); free(arg); } else switch (sbuf.st_mode & S_IFMT) { case S_IFDIR: isdir = B_TRUE; fallthrough; case S_IFLNK: case S_IFCHR: case S_IFBLK: if ((fpath = strdup(argv[0])) == NULL) fail("strdup"); break; default: usage(prog); } if (!isdir) { int fd; if ((fd = open(fpath, O_CREAT | O_RDWR, 0600)) < 0) fail("open"); } else { DIR *dp; if ((dp = opendir(fpath)) == NULL) fail("opendir"); } free(fpath); daemonize(); (void) pause(); return (0); } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib index bcba8ee759c9..bf70952904bb 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib @@ -1,654 +1,654 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # Copyright (c) 2012, 2019 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. # Copyright 2019 Richard Elling # # # Returns SCSI host number for the given disk # function get_scsi_host #disk { typeset disk=$1 ls /sys/block/${disk}/device/scsi_device | cut -d : -f 1 } # # Cause a scan of all scsi host adapters by default # # $1 optional host number # function scan_scsi_hosts { typeset hostnum=${1} if is_linux; then if [[ -z $hostnum ]]; then for host in /sys/class/scsi_host/host*; do log_must eval "echo '- - -' > $host/scan" done else log_must eval \ "echo /sys/class/scsi_host/host$hostnum/scan" \ > /dev/null log_must eval \ "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan" fi fi } # # Wait for newly created block devices to have their minors created. # Additional arguments can be passed to udevadm trigger, with the expected # arguments to typically be a block device pathname. This is useful when # checking waiting on a specific device to settle rather than triggering # all devices and waiting for them all to settle. # # The udevadm settle timeout can be 120 or 180 seconds by default for # some distros. If a long delay is experienced, it could be due to some # strangeness in a malfunctioning device that isn't related to the devices # under test. To help debug this condition, a notice is given if settle takes # too long. # # Note: there is no meaningful return code if udevadm fails. Consumers # should not expect a return code (do not call as argument to log_must) # function block_device_wait { if is_linux; then - udevadm trigger $* + udevadm trigger $* 2>/dev/null typeset start=$SECONDS udevadm settle typeset elapsed=$((SECONDS - start)) [[ $elapsed > 60 ]] && \ log_note udevadm settle time too long: $elapsed elif is_freebsd; then if [[ ${#@} -eq 0 ]]; then # Do something that has to go through the geom event # queue to complete. sysctl kern.geom.conftxt >/dev/null return fi fi # Poll for the given paths to appear, but give up eventually. typeset -i i for (( i = 0; i < 5; ++i )); do typeset missing=false typeset dev for dev in "${@}"; do if ! [[ -e $dev ]]; then missing=true break fi done if ! $missing; then break fi sleep ${#@} done } # # Check if the given device is physical device # function is_physical_device #device { typeset device=${1#$DEV_DSKDIR/} device=${device#$DEV_RDSKDIR/} if is_linux; then is_disk_device "$DEV_DSKDIR/$device" && \ [[ -f /sys/module/loop/parameters/max_part ]] return $? elif is_freebsd; then is_disk_device "$DEV_DSKDIR/$device" && \ echo $device | egrep -q \ -e '^a?da[0-9]+$' \ -e '^md[0-9]+$' \ -e '^mfid[0-9]+$' \ -e '^nda[0-9]+$' \ -e '^nvd[0-9]+$' \ -e '^vtbd[0-9]+$' return $? else echo $device | egrep "^c[0-F]+([td][0-F]+)+$" > /dev/null 2>&1 return $? fi } # # Check if the given device is a real device (ie SCSI device) # function is_real_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ egrep disk >/dev/null return $? fi } # # Check if the given device is a loop device # function is_loop_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ egrep loop >/dev/null return $? fi } # # Linux: # Check if the given device is a multipath device and if there is a symbolic # link to a device mapper and to a disk # Currently no support for dm devices alone without multipath # # FreeBSD: # Check if the given device is a gmultipath device. # # Others: # No multipath detection. # function is_mpath_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_MPATHDIR/$disk -o TYPE 2>/dev/null | \ egrep mpath >/dev/null if (($? == 0)); then readlink $DEV_MPATHDIR/$disk > /dev/null 2>&1 return $? else return $? fi elif is_freebsd; then is_disk_device $DEV_MPATHDIR/$disk else false fi } # # Check if the given path is the appropriate sort of device special node. # function is_disk_device #path { typeset path=$1 if is_freebsd; then # FreeBSD doesn't have block devices, only character devices. test -c $path else test -b $path fi } # Set the slice prefix for disk partitioning depending # on whether the device is a real, multipath, or loop device. # Currently all disks have to be of the same type, so only # checks first disk to determine slice prefix. # function set_slice_prefix { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | nawk '{print $(i + 1)}')" if ( is_mpath_device $disk ) && [[ -z $(echo $disk | awk 'substr($1,18,1)\ ~ /^[[:digit:]]+$/') ]] || ( is_real_device $disk ); then export SLICE_PREFIX="" return 0 elif ( is_mpath_device $disk || is_loop_device \ $disk ); then export SLICE_PREFIX="p" return 0 else log_fail "$disk not supported for partitioning." fi (( i = i + 1)) done fi } # # Set the directory path of the listed devices in $DISK_ARRAY_NUM # Currently all disks have to be of the same type, so only # checks first disk to determine device directory # default = /dev (linux) # real disk = /dev (linux) # multipath device = /dev/mapper (linux) # function set_device_dir { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | nawk '{print $(i + 1)}')" if is_mpath_device $disk; then export DEV_DSKDIR=$DEV_MPATHDIR return 0 else export DEV_DSKDIR=$DEV_RDSKDIR return 0 fi (( i = i + 1)) done else export DEV_DSKDIR=$DEV_RDSKDIR fi } # # Get the directory path of given device # function get_device_dir #device { typeset device=$1 if ! is_freebsd && ! is_physical_device $device; then if [[ $device != "/" ]]; then device=${device%/*} fi if is_disk_device "$DEV_DSKDIR/$device"; then device="$DEV_DSKDIR" fi echo $device else echo "$DEV_DSKDIR" fi } # # Get persistent name for given disk # function get_persistent_disk_name #device { typeset device=$1 typeset dev_id if is_linux; then if is_real_device $device; then dev_id="$(udevadm info -q all -n $DEV_DSKDIR/$device \ | egrep disk/by-id | nawk '{print $2; exit}' \ | nawk -F / '{print $3}')" echo $dev_id elif is_mpath_device $device; then dev_id="$(udevadm info -q all -n $DEV_DSKDIR/$device \ | egrep disk/by-id/dm-uuid \ | nawk '{print $2; exit}' \ | nawk -F / '{print $3}')" echo $dev_id else echo $device fi else echo $device fi } # # Online or offline a disk on the system # # First checks state of disk. Test will fail if disk is not properly onlined # or offlined. Online is a full rescan of SCSI disks by echoing to every # host entry. # function on_off_disk # disk state{online,offline} host { typeset disk=$1 typeset state=$2 typeset host=$3 [[ -z $disk ]] || [[ -z $state ]] && \ log_fail "Arguments invalid or missing" if is_linux; then if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then dm_name="$(readlink $DEV_DSKDIR/$disk \ | nawk -F / '{print $2}')" dep="$(ls /sys/block/${dm_name}/slaves \ | nawk '{print $1}')" while [[ -n $dep ]]; do #check if disk is online lsscsi | egrep $dep > /dev/null if (($? == 0)); then dep_dir="/sys/block/${dm_name}" dep_dir+="/slaves/${dep}/device" ss="${dep_dir}/state" sd="${dep_dir}/delete" log_must eval "echo 'offline' > ${ss}" log_must eval "echo '1' > ${sd}" lsscsi | egrep $dep > /dev/null if (($? == 0)); then log_fail "Offlining" \ "$disk failed" fi fi dep="$(ls /sys/block/$dm_name/slaves \ 2>/dev/null | nawk '{print $1}')" done elif [[ $state == "offline" ]] && ( is_real_device $disk ); then #check if disk is online lsscsi | egrep $disk > /dev/null if (($? == 0)); then dev_state="/sys/block/$disk/device/state" dev_delete="/sys/block/$disk/device/delete" log_must eval "echo 'offline' > ${dev_state}" log_must eval "echo '1' > ${dev_delete}" lsscsi | egrep $disk > /dev/null if (($? == 0)); then log_fail "Offlining $disk" \ "failed" fi else log_note "$disk is already offline" fi elif [[ $state == "online" ]]; then #force a full rescan scan_scsi_hosts $host block_device_wait if is_mpath_device $disk; then dm_name="$(readlink $DEV_DSKDIR/$disk \ | nawk -F / '{print $2}')" dep="$(ls /sys/block/$dm_name/slaves \ | nawk '{print $1}')" lsscsi | egrep $dep > /dev/null if (($? != 0)); then log_fail "Onlining $disk failed" fi elif is_real_device $disk; then block_device_wait typeset -i retries=0 while ! lsscsi | egrep -q $disk; do if (( $retries > 2 )); then log_fail "Onlining $disk failed" break fi (( ++retries )) sleep 1 done else log_fail "$disk is not a real dev" fi else log_fail "$disk failed to $state" fi fi } # # Simulate disk removal # function remove_disk #disk { typeset disk=$1 on_off_disk $disk "offline" block_device_wait } # # Simulate disk insertion for the given SCSI host # function insert_disk #disk scsi_host { typeset disk=$1 typeset scsi_host=$2 on_off_disk $disk "online" $scsi_host block_device_wait } # # Load scsi_debug module with specified parameters # $blksz can be either one of: < 512b | 512e | 4Kn > # function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz { typeset devsize=$1 typeset hosts=$2 typeset tgts=$3 typeset luns=$4 typeset blksz=$5 [[ -z $devsize ]] || [[ -z $hosts ]] || [[ -z $tgts ]] || \ [[ -z $luns ]] || [[ -z $blksz ]] && \ log_fail "Arguments invalid or missing" case "$5" in '512b') typeset sector=512 typeset blkexp=0 ;; '512e') typeset sector=512 typeset blkexp=3 ;; '4Kn') typeset sector=4096 typeset blkexp=0 ;; *) log_fail "Unsupported blksz value: $5" ;; esac if is_linux; then modprobe -n scsi_debug if (($? != 0)); then log_unsupported "Platform does not have scsi_debug" "module" fi lsmod | egrep scsi_debug > /dev/null if (($? == 0)); then log_fail "scsi_debug module already installed" else log_must modprobe scsi_debug dev_size_mb=$devsize \ add_host=$hosts num_tgts=$tgts max_luns=$luns \ sector_size=$sector physblk_exp=$blkexp block_device_wait lsscsi | egrep scsi_debug > /dev/null if (($? == 1)); then log_fail "scsi_debug module install failed" fi fi fi } # # Unload scsi_debug module, if needed. # function unload_scsi_debug { log_must_retry "in use" 5 modprobe -r scsi_debug } # # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). # function get_debug_device { for i in {1..10} ; do val=$(lsscsi | nawk '/scsi_debug/ {print $6; exit}' | cut -d / -f3) # lsscsi can take time to settle if [ "$val" != "-" ] ; then break fi sleep 1 done echo "$val" } # # Get actual devices used by the pool (i.e. linux sdb1 not sdb). # function get_pool_devices #testpool #devdir { typeset testpool=$1 typeset devdir=$2 typeset out="" if is_linux || is_freebsd; then out=$(zpool status -P $testpool |grep ${devdir} | awk '{print $1}') out=$(echo $out | sed -e "s|${devdir}/||g" | tr '\n' ' ') fi echo $out } # # Write to standard out giving the level, device name, offset and length # of all blocks in an input file. The offset and length are in units of # 512 byte blocks. In the case of mirrored vdevs, only the first # device is listed, as the levels, blocks and offsets will be the same # on other devices. Note that this function only works with mirrored # or non-redundant pools, not raidz. # # The output of this function can be used to introduce corruption at # varying levels of indirection. # function list_file_blocks # input_file { typeset input_file=$1 [[ -f $input_file ]] || log_fail "Couldn't find $input_file" typeset ds="$(zfs list -H -o name $input_file)" typeset pool="${ds%%/*}" typeset objnum="$(get_objnum $input_file)" # # Establish a mapping between vdev ids as shown in a DVA and the # pathnames they correspond to in ${VDEV_MAP[][]}. # # The vdev bits in a DVA refer to the top level vdev id. # ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev. # eval $(zdb -C $pool | awk ' BEGIN { printf "typeset -a VDEV_MAP;" } function subscript(s) { # "[#]" is more convenient than the bare "#" match(s, /\[[0-9]*\]/) return substr(s, RSTART, RLENGTH) } id && !/^ / { # left a top level vdev id = 0 } id && $1 ~ /^path:$/ { # found a vdev path; save it in the map printf "VDEV_MAP%s%s=%s;", id, child, $2 } /^ children/ { # entering a top level vdev id = subscript($0) child = "[0]" # default in case there is no nested vdev printf "typeset -a VDEV_MAP%s;", id } /^ children/ { # entering a nested vdev (e.g. child of a top level mirror) child = subscript($0) } ') # # The awk below parses the output of zdb, printing out the level # of each block along with vdev id, offset and length. The last # two are converted to decimal in the while loop. 4M is added to # the offset to compensate for the first two labels and boot # block. Lastly, the offset and length are printed in units of # 512B blocks for ease of use with dd. # typeset level vdev path offset length if awk -n '' 2>/dev/null; then # gawk needs -n to decode hex AWK='awk -n' else AWK='awk' fi log_must zpool sync -f zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' /^$/ { looking = 0 } looking { level = $2 field = 3 while (split($field, dva, ":") == 3) { # top level vdev id vdev = int(dva[1]) # offset + 4M label/boot pad in 512B blocks offset = (int("0x"dva[2]) + pad) / bs # length in 512B blocks len = int("0x"dva[3]) / bs print level, vdev, offset, len ++field } } /^Indirect blocks:/ { looking = 1 } ' | \ while read level vdev offset length; do for path in ${VDEV_MAP[$vdev][@]}; do echo "$level $path $offset $length" done done 2>/dev/null } function corrupt_blocks_at_level # input_file corrupt_level { typeset input_file=$1 typeset corrupt_level="L${2:-0}" typeset level path offset length [[ -f $input_file ]] || log_fail "Couldn't find $input_file" if is_freebsd; then # Temporarily allow corrupting an inuse device. debugflags=$(sysctl -n kern.geom.debugflags) sysctl kern.geom.debugflags=16 fi list_file_blocks $input_file | \ while read level path offset length; do if [[ $level = $corrupt_level ]]; then log_must dd if=/dev/urandom of=$path bs=512 \ count=$length seek=$offset conv=notrunc fi done if is_freebsd; then sysctl kern.geom.debugflags=$debugflags fi # This is necessary for pools made of loop devices. sync } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am index bf33ed038d78..35332f822e6c 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am @@ -1,20 +1,21 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_share dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ zfs_share_001_pos.ksh \ zfs_share_002_pos.ksh \ zfs_share_003_pos.ksh \ zfs_share_004_pos.ksh \ zfs_share_005_pos.ksh \ zfs_share_006_pos.ksh \ zfs_share_007_neg.ksh \ zfs_share_008_neg.ksh \ zfs_share_009_neg.ksh \ zfs_share_010_neg.ksh \ zfs_share_011_pos.ksh \ zfs_share_012_pos.ksh \ + zfs_share_013_pos.ksh \ zfs_share_concurrent_shares.ksh dist_pkgdata_DATA = \ zfs_share.cfg diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh index 29ca9a143a27..c64157cee601 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh @@ -1,85 +1,85 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # Verify that invalid share parameters and options are caught. # # STRATEGY: # 1. Create a ZFS file system. # 2. For each option in the list, set the sharenfs property. # 3. Verify that the error code and sharenfs property. # verify_runnable "both" function cleanup { if is_global_zone; then log_must zfs set sharenfs=off $TESTPOOL/$TESTFS fi } set -A badopts \ "r0" "r0=machine1" "r0=machine1:machine2" \ - "-g" "-b" "-c" "-d" "--invalid" \ + "-g" "-b" "-c" "-d" "--invalid" "rw=[::1]a:[::2]" "rw=[::1" \ "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL\$TESTCTR\$TESTFS1" log_assert "Verify that invalid share parameters and options are caught." log_onexit cleanup typeset -i i=0 while (( i < ${#badopts[*]} )) do log_note "Setting sharenfs=${badopts[i]} $i " log_mustnot zfs set sharenfs="${badopts[i]}" $TESTPOOL/$TESTFS showshares_nfs | grep $option > /dev/null 2>&1 if (( $? == 0 )); then log_fail "An invalid setting '$option' was propagated." fi # # To global zone, sharenfs must be set 'off' before malformed testing. # Otherwise, the malformed test return '0'. # # To non-global zone, sharenfs can be set even 'off' or 'on'. # if is_global_zone; then log_note "Resetting sharenfs option" log_must zfs set sharenfs=off $TESTPOOL/$TESTFS fi ((i = i + 1)) done log_pass "Invalid share parameters and options we caught as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_013_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_013_pos.ksh new file mode 100755 index 000000000000..150eddac0ebb --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_013_pos.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Felix Dörre +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that NFS share options including ipv6 literals are parsed and propagated correctly. +# + +verify_runnable "global" + +function cleanup +{ + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS + is_shared $TESTPOOL/$TESTFS && \ + log_must unshare_fs $TESTPOOL/$TESTFS +} + +log_onexit cleanup + +cleanup + +log_must zfs set sharenfs="rw=[::1]" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[2::3]" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "2::3(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]:[2::3]" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1(" <<< "$output" > /dev/null +log_must grep "2::3(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]/64" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1/64(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[2::3]/128" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "2::3/128(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]/32:[2::3]/128" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1/32(" <<< "$output" > /dev/null +log_must grep "2::3/128(" <<< "$output" > /dev/null + +log_must zfs set sharenfs="rw=[::1]:[2::3]/64:[2a01:1234:1234:1234:aa34:234:1234:1234]:1.2.3.4/24" $TESTPOOL/$TESTFS +output=$(showshares_nfs 2>&1) +log_must grep "::1(" <<< "$output" > /dev/null +log_must grep "2::3/64(" <<< "$output" > /dev/null +log_must grep "2a01:1234:1234:1234:aa34:234:1234:1234(" <<< "$output" > /dev/null +log_must grep "1\\.2\\.3\\.4/24(" <<< "$output" > /dev/null + +log_pass "NFS share ip address propagated correctly." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh index 987ecca31396..7da8be3d17eb 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh @@ -1,193 +1,193 @@ #!/bin/ksh -p # # CDDL HEADER START # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy is of the CDDL is also available via the Internet # at http://www.illumos.org/license/CDDL. # # CDDL HEADER END # # # Copyright (c) 2018 Datto Inc. # . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # zfs unmount should work on nested datasets # # STRATEGY: # 1. Create a set of nested datasets # 2. Unmount a nested dataset and make sure it is unmounted # 3. Ensure the dataset deeper than the one above is also unmounted # 4. Ensure the datasets shallower than the unmounted one is still mounted # 5. Repeat from step 2 with other mountpoint values and shallower nesting # verify_runnable "both" function nesting_cleanup { log_must zfs destroy -fR $TESTPOOL/a log_must zfs destroy -fR $TESTPOOL/b log_must zfs destroy -fR $TESTPOOL/c log_must zfs destroy -fR $TESTPOOL/d } log_onexit nesting_cleanup set -A test_depths 30 16 3 typeset mountpoint=/$TESTPOOL/mnt dsA32=$(printf 'a/%.0s' {1..32})"a" log_must zfs create -p $TESTPOOL/$dsA32 dsB32=$(printf 'b/%.0s' {1..32})"b" log_must zfs create -o mountpoint=none -p $TESTPOOL/$dsB32 # FreeBSD's mount command ignores the mountpoint property. if ! is_freebsd; then log_mustnot mount -t zfs $TESTPOOL/$dsB32 /mnt fi dsC32=$(printf 'c/%.0s' {1..32})"c" log_must zfs create -o mountpoint=legacy -p $TESTPOOL/$dsC32 log_must mount -t zfs $TESTPOOL/$dsC32 /mnt dsD32=$(printf 'd/%.0s' {1..32})"d" log_must zfs create -o mountpoint=$mountpoint -p $TESTPOOL/$dsD32 for d in ${test_depths[@]}; do # default mountpoint ds_pre=$(printf 'a/%.0s' {1..$(($d-2))})"a" ds=$(printf 'a/%.0s' {1..$(($d-1))})"a" ds_post=$(printf 'a/%.0s' {1..$(($d))})"a" if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not initially mounted" fi if ! ismounted $TESTPOOL/$ds; then log_fail "$TESTPOOL/$ds not initially mounted" fi if ! ismounted $TESTPOOL/$ds_post; then log_fail "$TESTPOOL/$ds_post (post) not initially mounted" fi log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$ds_pre is not mounted" fi if ismounted $TESTPOOL/$ds; then log_fail "$ds is mounted" fi if ismounted $TESTPOOL/$ds_post; then log_fail "$ds_post (post) is mounted" fi # mountpoint=none ds_pre=$(printf 'b/%.0s' {1..$(($d-2))})"b" ds=$(printf 'b/%.0s' {1..$(($d-1))})"b" ds_post=$(printf 'b/%.0s' {1..$(($d))})"b" if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not initially mounted" fi if ! ismounted $TESTPOOL/$ds; then log_fail "$TESTPOOL/$ds not initially mounted" fi if ! ismounted $TESTPOOL/$ds_post; then log_fail "$TESTPOOL/$ds_post (post) not initially mounted" fi log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not mounted" fi if ismounted $TESTPOOL/$ds; then log_fail "$TESTPOOL/$ds is mounted" fi if ismounted $TESTPOOL/$ds_post; then log_fail "$TESTPOOL/$ds_post (post) is mounted" fi # mountpoint=legacy ds_pre=$(printf 'c/%.0s' {1..$(($d-2))})"c" ds=$(printf 'c/%.0s' {1..$(($d-1))})"c" ds_post=$(printf 'c/%.0s' {1..$(($d))})"c" if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not initially mounted" fi if ! ismounted $TESTPOOL/$ds; then log_fail "$TESTPOOL/$ds not initially mounted" fi if ! ismounted $TESTPOOL/$ds_post; then log_fail "$TESTPOOL/$ds_post (post) not initially mounted" fi log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not mounted" fi if ismounted $TESTPOOL/$ds; then log_fail "$TESTPOOL/$ds is mounted" fi if ismounted $TESTPOOL/$ds_post; then log_fail "$TESTPOOL/$ds_post (post) is mounted" fi # mountpoint=/testpool/mnt ds_pre=$(printf 'd/%.0s' {1..$(($d-2))})"d" ds=$(printf 'd/%.0s' {1..$(($d-1))})"d" ds_post=$(printf 'd/%.0s' {1..$(($d))})"d" if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$TESTPOOL/$ds_pre (pre) not initially mounted" fi if ! ismounted $TESTPOOL/$ds; then log_fail "$TESTPOOL/$ds not initially mounted" fi if ! ismounted $TESTPOOL/$ds_post; then log_fail "$TESTPOOL/$ds_post (post) not initially mounted" fi log_must zfs snapshot $TESTPOOL/$ds@snap # force snapshot mount in .zfs log_must ls /$TESTPOOL/$ds/.zfs/snapshot/snap - log_must zfs unmount $TESTPOOL/$ds + log_must_nostderr zfs unmount $TESTPOOL/$ds if ! ismounted $TESTPOOL/$ds_pre; then log_fail "$ds_pre is not mounted" fi if ismounted $TESTPOOL/$ds; then log_fail "$ds is mounted" fi if ismounted $TESTPOOL/$ds_post; then log_fail "$ds_post (post) is mounted" fi done log_must rmdir $mountpoint # remove the mountpoint we created log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL log_pass "Verified nested dataset are unmounted." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/Makefile.am index 92a973258d65..817bd41e8075 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/Makefile.am @@ -1,15 +1,17 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/compression dist_pkgdata_SCRIPTS = \ cleanup.ksh \ setup.ksh \ compress_001_pos.ksh \ compress_002_pos.ksh \ compress_003_pos.ksh \ compress_004_pos.ksh \ + compress_zstd_bswap.ksh \ l2arc_compressed_arc.ksh \ l2arc_compressed_arc_disabled.ksh \ l2arc_encrypted.ksh \ l2arc_encrypted_no_compressed_arc.ksh dist_pkgdata_DATA = \ - compress.cfg + compress.cfg \ + testpool_zstd.tar.gz diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/compress_zstd_bswap.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/compress_zstd_bswap.ksh new file mode 100755 index 000000000000..9726cf0dd5a6 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/compress_zstd_bswap.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2007, Sun Microsystems Inc. All rights reserved. +# Copyright (c) 2021, Rich Ercolani. +# Use is subject to license terms. +# + +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Import a pool containing variously-permuted zstd-compressed files, +# then try to copy them out. + +typeset TESTPOOL_ZSTD_FILE=$STF_SUITE/tests/functional/compression/testpool_zstd.tar.gz +verify_runnable "both" + +function cleanup +{ + destroy_pool testpool_zstd + rm -f $TEST_BASE_DIR/testpool_zstd + +} + +log_assert "Trying to read data from variously mangled zstd datasets" +log_onexit cleanup + +log_must tar --directory $TEST_BASE_DIR -xzSf $TESTPOOL_ZSTD_FILE +log_must zpool import -d $TEST_BASE_DIR testpool_zstd +log_must dd if=/testpool_zstd/x86_64/zstd of=/dev/null +log_must dd if=/testpool_zstd/ppc64_fbsd/zstd of=/dev/null + +log_pass "Reading from mangled zstd datasets works as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/testpool_zstd.tar.gz b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/testpool_zstd.tar.gz new file mode 100644 index 000000000000..4096f7fcbe44 Binary files /dev/null and b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/compression/testpool_zstd.tar.gz differ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/Makefile.am index 3174f78c6249..e7479ae81056 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/Makefile.am @@ -1,15 +1,13 @@ include $(top_srcdir)/config/Rules.am pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/ctime dist_pkgdata_SCRIPTS = \ ctime_001_pos.ksh \ cleanup.ksh \ setup.ksh pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/ctime pkgexec_PROGRAMS = ctime ctime_SOURCES = ctime.c - -ctime_LDADD = $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/ctime.c b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/ctime.c index 2d515d957a90..b755be2feb29 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/ctime.c +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/ctime/ctime.c @@ -1,373 +1,379 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013 by Delphix. All rights reserved. */ #include #include #ifndef __FreeBSD__ #include #endif #include #include #include -#include #include #include #include #include #include #include #define ST_ATIME 0 #define ST_CTIME 1 #define ST_MTIME 2 #define ALL_MODE (mode_t)(S_IRWXU|S_IRWXG|S_IRWXO) typedef struct timetest { int type; char *name; int (*func)(const char *pfile); } timetest_t; static char tfile[BUFSIZ] = { 0 }; /* * DESCRIPTION: * Verify time will be changed correctly after each operation. * * STRATEGY: * 1. Define time test array. * 2. Loop through each item in this array. * 3. Verify the time is changed after each operation. * */ static int get_file_time(const char *pfile, int what, time_t *ptr) { struct stat stat_buf; if (pfile == NULL || ptr == NULL) { return (-1); } if (stat(pfile, &stat_buf) == -1) { return (-1); } switch (what) { case ST_ATIME: *ptr = stat_buf.st_atime; return (0); case ST_CTIME: *ptr = stat_buf.st_ctime; return (0); case ST_MTIME: *ptr = stat_buf.st_mtime; return (0); default: return (-1); } } +static ssize_t +get_dirnamelen(const char *path) +{ + const char *end = strrchr(path, '/'); + return (end ? end - path : -1); +} + static int do_read(const char *pfile) { int fd, ret = 0; char buf[BUFSIZ] = { 0 }; if (pfile == NULL) { return (-1); } if ((fd = open(pfile, O_RDONLY, ALL_MODE)) == -1) { return (-1); } if (read(fd, buf, sizeof (buf)) == -1) { (void) fprintf(stderr, "read(%d, buf, %zd) failed with errno " "%d\n", fd, sizeof (buf), errno); (void) close(fd); return (1); } (void) close(fd); return (ret); } static int do_write(const char *pfile) { int fd, ret = 0; char buf[BUFSIZ] = "call function do_write()"; if (pfile == NULL) { return (-1); } if ((fd = open(pfile, O_WRONLY, ALL_MODE)) == -1) { return (-1); } if (write(fd, buf, strlen(buf)) == -1) { (void) fprintf(stderr, "write(%d, buf, %d) failed with errno " "%d\n", fd, (int)strlen(buf), errno); (void) close(fd); return (1); } (void) close(fd); return (ret); } static int do_link(const char *pfile) { int ret = 0; char link_file[BUFSIZ + 16] = { 0 }; if (pfile == NULL) { return (-1); } /* * Figure out source file directory name, and create * the link file in the same directory. */ (void) snprintf(link_file, sizeof (link_file), - "%.*s/%s", (int)zfs_dirnamelen(pfile), pfile, "link_file"); + "%.*s/%s", (int)get_dirnamelen(pfile), pfile, "link_file"); if (link(pfile, link_file) == -1) { (void) fprintf(stderr, "link(%s, %s) failed with errno %d\n", pfile, link_file, errno); return (1); } (void) unlink(link_file); return (ret); } static int do_creat(const char *pfile) { int fd, ret = 0; if (pfile == NULL) { return (-1); } if ((fd = creat(pfile, ALL_MODE)) == -1) { (void) fprintf(stderr, "creat(%s, ALL_MODE) failed with errno " "%d\n", pfile, errno); return (1); } (void) close(fd); return (ret); } static int do_utime(const char *pfile) { int ret = 0; if (pfile == NULL) { return (-1); } /* * Times of the file are set to the current time */ if (utime(pfile, NULL) == -1) { (void) fprintf(stderr, "utime(%s, NULL) failed with errno " "%d\n", pfile, errno); return (1); } return (ret); } static int do_chmod(const char *pfile) { int ret = 0; if (pfile == NULL) { return (-1); } if (chmod(pfile, ALL_MODE) == -1) { (void) fprintf(stderr, "chmod(%s, ALL_MODE) failed with " "errno %d\n", pfile, errno); return (1); } return (ret); } static int do_chown(const char *pfile) { int ret = 0; if (pfile == NULL) { return (-1); } if (chown(pfile, getuid(), getgid()) == -1) { (void) fprintf(stderr, "chown(%s, %d, %d) failed with errno " "%d\n", pfile, (int)getuid(), (int)getgid(), errno); return (1); } return (ret); } #ifndef __FreeBSD__ static int do_xattr(const char *pfile) { int ret = 0; char *value = "user.value"; if (pfile == NULL) { return (-1); } if (setxattr(pfile, "user.x", value, strlen(value), 0) == -1) { (void) fprintf(stderr, "setxattr(%s, %d, %d) failed with errno " "%d\n", pfile, (int)getuid(), (int)getgid(), errno); return (1); } return (ret); } #endif static void cleanup(void) { if ((strlen(tfile) != 0) && (access(tfile, F_OK) == 0)) { (void) unlink(tfile); } } static timetest_t timetest_table[] = { { ST_ATIME, "st_atime", do_read }, { ST_ATIME, "st_atime", do_utime }, { ST_MTIME, "st_mtime", do_creat }, { ST_MTIME, "st_mtime", do_write }, { ST_MTIME, "st_mtime", do_utime }, { ST_CTIME, "st_ctime", do_creat }, { ST_CTIME, "st_ctime", do_write }, { ST_CTIME, "st_ctime", do_chmod }, { ST_CTIME, "st_ctime", do_chown }, { ST_CTIME, "st_ctime", do_link }, { ST_CTIME, "st_ctime", do_utime }, #ifndef __FreeBSD__ { ST_CTIME, "st_ctime", do_xattr }, #endif }; #define NCOMMAND (sizeof (timetest_table) / sizeof (timetest_table[0])) /* ARGSUSED */ int main(int argc, char *argv[]) { int i, ret, fd; char *penv[] = {"TESTDIR", "TESTFILE0"}; (void) atexit(cleanup); /* * Get the environment variable values. */ for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { if ((penv[i] = getenv(penv[i])) == NULL) { (void) fprintf(stderr, "getenv(penv[%d])\n", i); return (1); } } (void) snprintf(tfile, sizeof (tfile), "%s/%s", penv[0], penv[1]); /* * If the test file exists, remove it first. */ if (access(tfile, F_OK) == 0) { (void) unlink(tfile); } ret = 0; if ((fd = open(tfile, O_WRONLY | O_CREAT | O_TRUNC, ALL_MODE)) == -1) { (void) fprintf(stderr, "open(%s) failed: %d\n", tfile, errno); return (1); } (void) close(fd); for (i = 0; i < NCOMMAND; i++) { time_t t1, t2; /* * Get original time before operating. */ ret = get_file_time(tfile, timetest_table[i].type, &t1); if (ret != 0) { (void) fprintf(stderr, "get_file_time(%s %d) = %d\n", tfile, timetest_table[i].type, ret); return (1); } /* * Sleep 2 seconds, then invoke command on given file */ (void) sleep(2); timetest_table[i].func(tfile); /* * Get time after operating. */ ret = get_file_time(tfile, timetest_table[i].type, &t2); if (ret != 0) { (void) fprintf(stderr, "get_file_time(%s %d) = %d\n", tfile, timetest_table[i].type, ret); return (1); } if (t1 == t2) { (void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); return (1); } else { (void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); } } return (0); } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh index b0b03f5d523e..fd6e8c858edd 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh @@ -1,90 +1,90 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2017 by Lawrence Livermore National Security, LLC. # Use is subject to license terms. # # DESCRIPTION: # Verify spa deadman detects a hung txg # # STRATEGY: # 1. Reduce the zfs_deadman_synctime_ms to 5s. # 2. Reduce the zfs_deadman_checktime_ms to 1s. # 3. Inject a 10s zio delay to force long IOs. # 4. Write enough data to force a long txg sync time due to the delay. # 5. Verify a "deadman" event is posted. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/deadman/deadman.cfg verify_runnable "both" function cleanup { log_must zinject -c all default_cleanup_noexit log_must set_tunable64 DEADMAN_SYNCTIME_MS $SYNCTIME_DEFAULT log_must set_tunable64 DEADMAN_CHECKTIME_MS $CHECKTIME_DEFAULT log_must set_tunable64 DEADMAN_FAILMODE $FAILMODE_DEFAULT } log_assert "Verify spa deadman detects a hung txg" log_onexit cleanup log_must set_tunable64 DEADMAN_SYNCTIME_MS 5000 log_must set_tunable64 DEADMAN_CHECKTIME_MS 1000 log_must set_tunable64 DEADMAN_FAILMODE "wait" # Create a new pool in order to use the updated deadman settings. default_setup_noexit $DISK1 log_must zpool events -c # Force each IO to take 10s by allow them to run concurrently. log_must zinject -d $DISK1 -D10000:10 $TESTPOOL mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file sleep 10 log_must zinject -c all log_must zpool sync # Log txg sync times for reference and the zpool event summary. if is_freebsd; then log_must sysctl -n kstat.zfs.$TESTPOOL.txgs else log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs fi log_must zpool events -# Verify at least 5 deadman events were logged. The first after 5 seconds, +# Verify at least 4 deadman events were logged. The first after 5 seconds, # and another each second thereafter until the delay is clearer. events=$(zpool events | grep -c ereport.fs.zfs.deadman) -if [ "$events" -lt 5 ]; then +if [ "$events" -lt 4 ]; then log_fail "Expect >=5 deadman events, $events found" fi log_pass "Verify spa deadman detected a hung txg and $events deadman events" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh index 6dfb3423fdf9..26573bfb594d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh @@ -1,33 +1,45 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2014, 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/rsend/rsend.kshlib # # Description: # Verify that error conditions don't cause panics in zfs send # # Strategy: # 1. Perform a zfs incremental send from a bookmark that doesn't exist +# 2. Perform a zfs incremental replication send with incremental source +# same as target (#11121) # verify_runnable "both" -log_neg eval "zfs send -i \#bla $POOl/$FS@final > /dev/null" +function cleanup +{ + rm -f $TEST_BASE_DIR/devnull +} + +log_onexit cleanup + +log_mustnot eval "zfs send -i \#bla $POOl/$FS@final > $TEST_BASE_DIR/devnull" + +log_must eval "zfs send -R -i snapA $POOL/vol@snapA 2>&1 " \ + "> $TEST_BASE_DIR/devnull | grep -q WARNING" log_pass "Ensure that error conditions cause appropriate failures." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib index 3ee09a151b12..b69d2ce02913 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib @@ -1,143 +1,142 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2017, loli10K . All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib . $STF_SUITE/tests/functional/zvol/zvol_common.shlib # # Wait for udev to settle, completely. # This is quite discomforting, but there's a race condition here # (Amazon 2015.09 x86_64 Release (TEST) is good at triggering this) where the # kernel tries to remove zvol device nodes while they're open by [blkid], # [zvol_id] or other udev related processes. # Calling 'udevadm settle' is not enough: wait for those processes "manually". # function udev_wait { - sleep 1 - is_linux || return 0 udevadm trigger --action=change udevadm settle for i in {1..3}; do blkid="$(pgrep blkid | wc -l)" zvol_id="$(pgrep zvol_id | wc -l)" [[ "0" == "$zvol_id" && "0" == "$blkid" ]] && return udevadm settle done log_fail "Wait timeout reached for udev_wait" } # # Clean up udev status # This is also a problem on "Amazon 2015.09 x86_64 Release (TEST)" where udev, # sometimes, does not clean up /dev/zvol symlinks correctly for removed ZVOLs. # Prune those links manually, then tell udev to forget them. # function udev_cleanup { - is_linux || return 0 log_note "Pruning broken ZVOL symlinks ..." udevadm settle # find all dangling links and delete them find -L "${ZVOL_DEVDIR}" -type l -print -delete # purge those links from udev database udevadm info --cleanup-db } # # Verify $device exists and is a block device # function blockdev_exists # device { typeset device="$1" # we wait here instead of doing it in a wrapper around 'zfs set snapdev' # because there are other commands (zfs snap, zfs inherit, zfs destroy) # that can affect device nodes for i in {1..3}; do - udev_wait + is_linux && udev_wait + block_device_wait "$device" is_disk_device "$device" && return 0 done log_fail "$device does not exist as a block device" } # # Verify $device does not exist # function blockdev_missing # device { typeset device="$1" # we wait here instead of doing it in a wrapper around 'zfs set snapdev' # because there are other commands (zfs snap, zfs inherit, zfs destroy) # that can affect device nodes for i in {1..3}; do - udev_wait - [[ ! -e "$device" ]] && return 0 + is_linux && udev_wait + block_device_wait + is_disk_device "$device" || return 0 done log_fail "$device exists when not expected" } # # Verify $property on $dataset is inherited by $parent and is set to $value # function verify_inherited # property value dataset parent { typeset property="$1" typeset value="$2" typeset dataset="$3" typeset parent="$4" typeset val=$(get_prop "$property" "$dataset") typeset src=$(get_source "$property" "$dataset") if [[ "$val" != "$value" || "$src" != "inherited from $parent" ]]; then log_fail "Dataset $dataset did not inherit $property properly:"\ "expected=$value, value=$val, source=$src." fi } # # Create a small partition on $device, then verify if we can access it # function verify_partition # device { typeset device="$1" if ! is_disk_device "$device"; then log_fail "$device is not a block device" fi # create a small dummy partition set_partition 0 "" 1m $device # verify we can access the partition on the device devname="$(readlink -f "$device")" if is_linux || is_freebsd; then is_disk_device "$devname""p1" else is_disk_device "$devname""s0" fi return $? } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh index c4ea23c48d16..e9b7bbc4c4a5 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh @@ -1,71 +1,71 @@ #!/bin/ksh -p # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright 2019, loli10K . All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/zvol/zvol_common.shlib . $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib # # DESCRIPTION: # Verify 'zfs rename' works on a ZVOL already in use as block device # # STRATEGY: # 1. Create a ZVOL # 2. Create a filesystem on the ZVOL device and mount it # 3. Rename the ZVOL dataset # 4. Receive a send stream with the same name as the old ZVOL dataset and verify # we don't trigger any issue like the one reported in #6263 # verify_runnable "global" function cleanup { log_must umount "$MNTPFS" log_must rmdir "$MNTPFS" for ds in "$SENDFS" "$ZVOL" "$ZVOL-renamed"; do destroy_dataset "$ds" '-rf' done - udev_wait + block_device_wait } log_assert "Verify 'zfs rename' works on a ZVOL already in use as block device" log_onexit cleanup ZVOL="$TESTPOOL/vol.$$" ZDEV="$ZVOL_DEVDIR/$ZVOL" MNTPFS="$TESTDIR/zvol_inuse_rename" SENDFS="$TESTPOOL/sendfs.$$" # 1. Create a ZVOL log_must zfs create -V $VOLSIZE "$ZVOL" # 2. Create a filesystem on the ZVOL device and mount it -udev_wait +block_device_wait "$ZDEV" log_must eval "new_fs $ZDEV >/dev/null 2>&1" log_must mkdir "$MNTPFS" log_must mount "$ZDEV" "$MNTPFS" # 3. Rename the ZVOL dataset log_must zfs rename "$ZVOL" "$ZVOL-renamed" # 4. Receive a send stream with the same name as the old ZVOL dataset and verify # we don't trigger any issue like the one reported in #6263 log_must zfs create "$SENDFS" log_must zfs snap "$SENDFS@snap" log_must eval "zfs send $SENDFS@snap | zfs recv $ZVOL" log_pass "Renaming in use ZVOL works successfully" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh index 8d95bfa39374..bbe40734f99c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh @@ -1,121 +1,121 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2017, loli10K . All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib . $STF_SUITE/tests/functional/zvol/zvol_common.shlib . $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib # # DESCRIPTION: # Verify that ZFS volume property "snapdev" works as intended. # # STRATEGY: # 1. Verify "snapdev" property does not accept invalid values # 2. Verify "snapdev" adds and removes device nodes when updated # 3. Verify "snapdev" is inherited correctly # verify_runnable "global" function cleanup { datasetexists $VOLFS && log_must zfs destroy -r $VOLFS datasetexists $ZVOL && log_must zfs destroy -r $ZVOL log_must zfs inherit snapdev $TESTPOOL block_device_wait - udev_cleanup + is_linux && udev_cleanup } log_assert "Verify that ZFS volume property 'snapdev' works as expected." log_onexit cleanup VOLFS="$TESTPOOL/volfs" ZVOL="$TESTPOOL/vol" SNAP="$ZVOL@snap" SNAPDEV="${ZVOL_DEVDIR}/$SNAP" SUBZVOL="$VOLFS/subvol" SUBSNAP="$SUBZVOL@snap" SUBSNAPDEV="${ZVOL_DEVDIR}/$SUBSNAP" log_must zfs create -o mountpoint=none $VOLFS log_must zfs create -V $VOLSIZE -s $ZVOL log_must zfs create -V $VOLSIZE -s $SUBZVOL # 1. Verify "snapdev" property does not accept invalid values typeset badvals=("off" "on" "1" "nope" "-") for badval in ${badvals[@]} do log_mustnot zfs set snapdev="$badval" $ZVOL done # 2. Verify "snapdev" adds and removes device nodes when updated # 2.1 First create a snapshot then change snapdev property log_must zfs snapshot $SNAP log_must zfs set snapdev=visible $ZVOL blockdev_exists $SNAPDEV log_must zfs set snapdev=hidden $ZVOL blockdev_missing $SNAPDEV log_must zfs destroy $SNAP # 2.2 First set snapdev property then create a snapshot log_must zfs set snapdev=visible $ZVOL log_must zfs snapshot $SNAP blockdev_exists $SNAPDEV log_must zfs destroy $SNAP blockdev_missing $SNAPDEV # 2.3 Verify setting to the same value multiple times does not lead to issues log_must zfs snapshot $SNAP log_must zfs set snapdev=visible $ZVOL blockdev_exists $SNAPDEV log_must zfs set snapdev=visible $ZVOL blockdev_exists $SNAPDEV log_must zfs set snapdev=hidden $ZVOL blockdev_missing $SNAPDEV log_must zfs set snapdev=hidden $ZVOL blockdev_missing $SNAPDEV log_must zfs destroy $SNAP # 3. Verify "snapdev" is inherited correctly # 3.1 Check snapdev=visible case log_must zfs snapshot $SNAP log_must zfs inherit snapdev $ZVOL log_must zfs set snapdev=visible $TESTPOOL verify_inherited 'snapdev' 'visible' $ZVOL $TESTPOOL blockdev_exists $SNAPDEV # 3.2 Check snapdev=hidden case log_must zfs set snapdev=hidden $TESTPOOL verify_inherited 'snapdev' 'hidden' $ZVOL $TESTPOOL blockdev_missing $SNAPDEV # 3.3 Check inheritance on multiple levels log_must zfs snapshot $SUBSNAP log_must zfs inherit snapdev $SUBZVOL log_must zfs set snapdev=hidden $VOLFS log_must zfs set snapdev=visible $TESTPOOL verify_inherited 'snapdev' 'hidden' $SUBZVOL $VOLFS blockdev_missing $SUBSNAPDEV blockdev_exists $SNAPDEV log_pass "ZFS volume property 'snapdev' works as expected" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_volmode.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_volmode.ksh index 322a31e07d89..264ec131ad6b 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_volmode.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_volmode.ksh @@ -1,244 +1,246 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2017, loli10K . All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib . $STF_SUITE/tests/functional/zvol/zvol_common.shlib . $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib # # DESCRIPTION: # Verify that ZFS volume property "volmode" works as intended. # # STRATEGY: # 1. Verify "volmode" property does not accept invalid values # 2. Verify "volmode=none" hides ZVOL device nodes # 3. Verify "volmode=full" exposes a fully functional device # 4. Verify "volmode=dev" hides partition info on the device # 5. Verify "volmode=default" behaves accordingly to "volmode" module parameter # 6. Verify "volmode" property is inherited correctly # 7. Verify "volmode" behaves correctly at import time # 8. Verify "volmode" behaves accordingly to zvol_inhibit_dev (Linux only) # # NOTE: changing volmode may need to remove minors, which could be open, so call -# udev_wait() before we "zfs set volmode=". +# block_device_wait() before we "zfs set volmode=". verify_runnable "global" function cleanup { - datasetexists $VOLFS && log_must_busy zfs destroy -r $VOLFS - datasetexists $ZVOL && log_must_busy zfs destroy -r $ZVOL - log_must zfs inherit volmode $TESTPOOL - udev_wait + datasetexists $VOLFS && destroy_dataset $VOLFS -r + datasetexists $ZVOL && destroy_dataset $ZVOL -r + zfs inherit volmode $TESTPOOL sysctl_inhibit_dev 0 sysctl_volmode 1 - udev_cleanup + is_linux && udev_cleanup } # # Set zvol_inhibit_dev tunable to $value # function sysctl_inhibit_dev # value { typeset value="$1" if is_linux; then log_note "Setting zvol_inhibit_dev tunable to $value" log_must set_tunable32 VOL_INHIBIT_DEV $value fi } # # Set volmode tunable to $value # function sysctl_volmode # value { typeset value="$1" log_note "Setting volmode tunable to $value" log_must set_tunable32 VOL_MODE $value } # # Exercise open and close, read and write operations # function test_io # dev { typeset dev=$1 - log_must dd if=/dev/zero of=$dev count=1 log_must dd if=$dev of=/dev/null count=1 + log_must dd if=/dev/zero of=$dev count=1 } log_assert "Verify that ZFS volume property 'volmode' works as intended" log_onexit cleanup VOLFS="$TESTPOOL/volfs" ZVOL="$TESTPOOL/vol" -ZDEV="${ZVOL_DEVDIR}/$ZVOL" +ZDEV="$ZVOL_DEVDIR/$ZVOL" SUBZVOL="$VOLFS/subvol" -SUBZDEV="${ZVOL_DEVDIR}/$SUBZVOL" +SUBZDEV="$ZVOL_DEVDIR/$SUBZVOL" +# 0. Verify basic ZVOL functionality log_must zfs create -o mountpoint=none $VOLFS log_must zfs create -V $VOLSIZE -s $SUBZVOL log_must zfs create -V $VOLSIZE -s $ZVOL -udev_wait blockdev_exists $ZDEV blockdev_exists $SUBZDEV test_io $ZDEV test_io $SUBZDEV # 1. Verify "volmode" property does not accept invalid values typeset badvals=("off" "on" "1" "nope" "-") for badval in ${badvals[@]} do log_mustnot zfs set volmode="$badval" $ZVOL done # 2. Verify "volmode=none" hides ZVOL device nodes log_must zfs set volmode=none $ZVOL blockdev_missing $ZDEV log_must_busy zfs destroy $ZVOL +blockdev_missing $ZDEV # 3. Verify "volmode=full" exposes a fully functional device log_must zfs create -V $VOLSIZE -s $ZVOL -udev_wait +blockdev_exists $ZDEV log_must zfs set volmode=full $ZVOL blockdev_exists $ZDEV test_io $ZDEV log_must verify_partition $ZDEV -udev_wait # 3.1 Verify "volmode=geom" is an alias for "volmode=full" log_must zfs set volmode=geom $ZVOL blockdev_exists $ZDEV if [[ "$(get_prop 'volmode' $ZVOL)" != "full" ]]; then log_fail " Volmode value 'geom' is not an alias for 'full'" fi -udev_wait log_must_busy zfs destroy $ZVOL +blockdev_missing $ZDEV # 4. Verify "volmode=dev" hides partition info on the device log_must zfs create -V $VOLSIZE -s $ZVOL -udev_wait +blockdev_exists $ZDEV log_must zfs set volmode=dev $ZVOL blockdev_exists $ZDEV test_io $ZDEV log_mustnot verify_partition $ZDEV -udev_wait log_must_busy zfs destroy $ZVOL +blockdev_missing $ZDEV # 5. Verify "volmode=default" behaves accordingly to "volmode" module parameter # 5.1 Verify sysctl "volmode=full" sysctl_volmode 1 log_must zfs create -V $VOLSIZE -s $ZVOL -udev_wait +blockdev_exists $ZDEV log_must zfs set volmode=default $ZVOL blockdev_exists $ZDEV log_must verify_partition $ZDEV -udev_wait log_must_busy zfs destroy $ZVOL +blockdev_missing $ZDEV # 5.2 Verify sysctl "volmode=dev" sysctl_volmode 2 log_must zfs create -V $VOLSIZE -s $ZVOL -udev_wait +blockdev_exists $ZDEV log_must zfs set volmode=default $ZVOL blockdev_exists $ZDEV log_mustnot verify_partition $ZDEV -udev_wait log_must_busy zfs destroy $ZVOL +blockdev_missing $ZDEV # 5.2 Verify sysctl "volmode=none" sysctl_volmode 3 log_must zfs create -V $VOLSIZE -s $ZVOL -udev_wait +blockdev_missing $ZDEV log_must zfs set volmode=default $ZVOL blockdev_missing $ZDEV # 6. Verify "volmode" property is inherited correctly log_must zfs inherit volmode $ZVOL +blockdev_missing $ZDEV # 6.1 Check volmode=full case log_must zfs set volmode=full $TESTPOOL verify_inherited 'volmode' 'full' $ZVOL $TESTPOOL blockdev_exists $ZDEV # 6.2 Check volmode=none case log_must zfs set volmode=none $TESTPOOL verify_inherited 'volmode' 'none' $ZVOL $TESTPOOL blockdev_missing $ZDEV # 6.3 Check volmode=dev case log_must zfs set volmode=dev $TESTPOOL verify_inherited 'volmode' 'dev' $ZVOL $TESTPOOL blockdev_exists $ZDEV # 6.4 Check volmode=default case sysctl_volmode 1 log_must zfs set volmode=default $TESTPOOL verify_inherited 'volmode' 'default' $ZVOL $TESTPOOL blockdev_exists $ZDEV # 6.5 Check inheritance on multiple levels log_must zfs inherit volmode $SUBZVOL -udev_wait log_must zfs set volmode=none $VOLFS -udev_wait log_must zfs set volmode=full $TESTPOOL verify_inherited 'volmode' 'none' $SUBZVOL $VOLFS blockdev_missing $SUBZDEV blockdev_exists $ZDEV # 7. Verify "volmode" behaves correctly at import time log_must zpool export $TESTPOOL blockdev_missing $ZDEV blockdev_missing $SUBZDEV log_must zpool import $TESTPOOL blockdev_exists $ZDEV blockdev_missing $SUBZDEV log_must_busy zfs destroy $ZVOL log_must_busy zfs destroy $SUBZVOL +blockdev_missing $ZDEV +blockdev_missing $SUBZDEV # 8. Verify "volmode" behaves accordingly to zvol_inhibit_dev (Linux only) if is_linux; then sysctl_inhibit_dev 1 # 7.1 Verify device nodes not are not created with "volmode=full" sysctl_volmode 1 log_must zfs create -V $VOLSIZE -s $ZVOL blockdev_missing $ZDEV log_must zfs set volmode=full $ZVOL blockdev_missing $ZDEV log_must_busy zfs destroy $ZVOL + blockdev_missing $ZDEV # 7.1 Verify device nodes not are not created with "volmode=dev" sysctl_volmode 2 log_must zfs create -V $VOLSIZE -s $ZVOL blockdev_missing $ZDEV log_must zfs set volmode=dev $ZVOL blockdev_missing $ZDEV log_must_busy zfs destroy $ZVOL + blockdev_missing $ZDEV # 7.1 Verify device nodes not are not created with "volmode=none" sysctl_volmode 3 log_must zfs create -V $VOLSIZE -s $ZVOL blockdev_missing $ZDEV log_must zfs set volmode=none $ZVOL blockdev_missing $ZDEV fi log_pass "Verify that ZFS volume property 'volmode' works as intended" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh index b8989f478727..a393606d0fa9 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh @@ -1,74 +1,74 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2017, loli10K . All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/zvol/zvol_common.shlib . $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib # # DESCRIPTION: # Verify ZIL functionality on ZVOLs # # STRATEGY: # 1. Create a ZVOLs with various combination of "logbias" and "sync" values # 2. Write data to ZVOL device node # 3. Verify we don't trigger any issue like the one reported in #6238 # verify_runnable "global" function cleanup { - datasetexists $ZVOL && log_must_busy zfs destroy $ZVOL - udev_wait + datasetexists $ZVOL && destroy_dataset $ZVOL + block_device_wait } log_assert "Verify ZIL functionality on ZVOLs" log_onexit cleanup ZVOL="$TESTPOOL/vol" ZDEV="$ZVOL_DEVDIR/$ZVOL" typeset -a logbias_prop_vals=('latency' 'throughput') typeset -a sync_prop_vals=('standard' 'always' 'disabled') for logbias in ${logbias_prop_vals[@]}; do for sync in ${sync_prop_vals[@]}; do # 1. Create a ZVOL with logbias=throughput and sync=always log_must zfs create -V $VOLSIZE -b 128K -o sync=$sync \ -o logbias=$logbias $ZVOL # 2. Write data to its device node for i in {1..50}; do dd if=/dev/zero of=$ZDEV bs=8k count=1 & done # 3. Verify we don't trigger any issue log_must wait log_must_busy zfs destroy $ZVOL done done log_pass "ZIL functionality works on ZVOLs" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/perf.shlib b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/perf.shlib index 6f4fdc94348f..5a404df083e4 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/perf.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/perf.shlib @@ -1,602 +1,601 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # Copyright (c) 2016, Intel Corporation. # . $STF_SUITE/include/libtest.shlib # Defaults common to all the tests in the regression group export PERF_RUNTIME=${PERF_RUNTIME:-'180'} export PERF_RANDSEED=${PERF_RANDSEED:-'1234'} export PERF_COMPPERCENT=${PERF_COMPPERCENT:-'66'} export PERF_COMPCHUNK=${PERF_COMPCHUNK:-'4096'} -export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Default to JSON for fio output export PERF_FIO_FORMAT=${PERF_FIO_FORMAT:-'json'} # Default fs creation options export PERF_FS_OPTS=${PERF_FS_OPTS:-'-o recsize=8k -o compress=lz4' \ ' -o checksum=sha256 -o redundant_metadata=most'} function get_sync_str { typeset sync=$1 typeset sync_str='' [[ $sync -eq 0 ]] && sync_str='async' [[ $sync -eq 1 ]] && sync_str='sync' echo $sync_str } function get_suffix { typeset threads=$1 typeset sync=$2 typeset iosize=$3 typeset sync_str=$(get_sync_str $sync) typeset filesystems=$(get_nfilesystems) typeset suffix="$sync_str.$iosize-ios" suffix="$suffix.$threads-threads.$filesystems-filesystems" echo $suffix } function do_fio_run_impl { typeset script=$1 typeset do_recreate=$2 typeset clear_cache=$3 typeset threads=$4 typeset threads_per_fs=$5 typeset sync=$6 typeset iosize=$7 typeset sync_str=$(get_sync_str $sync) log_note "Running with $threads $sync_str threads, $iosize ios" if [[ -n $threads_per_fs && $threads_per_fs -ne 0 ]]; then log_must test $do_recreate verify_threads_per_fs $threads $threads_per_fs fi if $do_recreate; then recreate_perf_pool # # A value of zero for "threads_per_fs" is "special", and # means a single filesystem should be used, regardless # of the number of threads. # if [[ -n $threads_per_fs && $threads_per_fs -ne 0 ]]; then populate_perf_filesystems $((threads / threads_per_fs)) else populate_perf_filesystems 1 fi fi if $clear_cache; then # Clear the ARC log_must zinject -a fi if [[ -n $ZINJECT_DELAYS ]]; then apply_zinject_delays else log_note "No per-device commands to execute." fi # # Allow this to be overridden by the individual test case. This # can be used to run the FIO job against something other than # the default filesystem (e.g. against a clone). # export DIRECTORY=$(get_directory) log_note "DIRECTORY: " $DIRECTORY export RUNTIME=$PERF_RUNTIME export RANDSEED=$PERF_RANDSEED export COMPPERCENT=$PERF_COMPPERCENT export COMPCHUNK=$PERF_COMPCHUNK export FILESIZE=$((TOTAL_SIZE / threads)) export NUMJOBS=$threads export SYNC_TYPE=$sync export BLOCKSIZE=$iosize sync # When running locally, we want to keep the default behavior of # DIRECT == 0, so only set it when we're running over NFS to # disable client cache for reads. if [[ $NFS -eq 1 ]]; then export DIRECT=1 do_setup_nfs $script else export DIRECT=0 fi # This will be part of the output filename. typeset suffix=$(get_suffix $threads $sync $iosize) # Start the data collection do_collect_scripts $suffix # Define output file typeset logbase="$(get_perf_output_dir)/$(basename \ $SUDO_COMMAND)" typeset outfile="$logbase.fio.$suffix" # Start the load if [[ $NFS -eq 1 ]]; then log_must ssh -t $NFS_USER@$NFS_CLIENT " fio --output-format=${PERF_FIO_FORMAT} \ --output /tmp/fio.out /tmp/test.fio " log_must scp $NFS_USER@$NFS_CLIENT:/tmp/fio.out $outfile log_must ssh -t $NFS_USER@$NFS_CLIENT "sudo -S umount $NFS_MOUNT" else log_must fio --output-format=${PERF_FIO_FORMAT} \ --output $outfile $FIO_SCRIPTS/$script fi } # # This function will run fio in a loop, according to the .fio file passed # in and a number of environment variables. The following variables can be # set before launching zfstest to override the defaults. # # PERF_RUNTIME: The time in seconds each fio invocation should run. # PERF_NTHREADS: A list of how many threads each fio invocation will use. # PERF_SYNC_TYPES: Whether to use (O_SYNC) or not. 1 is sync IO, 0 is async IO. # PERF_IOSIZES: A list of blocksizes in which each fio invocation will do IO. # PERF_COLLECT_SCRIPTS: A comma delimited list of 'command args, logfile_tag' # pairs that will be added to the scripts specified in each test. # function do_fio_run { typeset script=$1 typeset do_recreate=$2 typeset clear_cache=$3 typeset threads threads_per_fs sync iosize for threads in $PERF_NTHREADS; do for threads_per_fs in $PERF_NTHREADS_PER_FS; do for sync in $PERF_SYNC_TYPES; do for iosize in $PERF_IOSIZES; do do_fio_run_impl \ $script \ $do_recreate \ $clear_cache \ $threads \ $threads_per_fs \ $sync \ $iosize done done done done } # This function sets NFS mount on the client and make sure all correct # permissions are in place # function do_setup_nfs { typeset script=$1 zfs set sharenfs=on $TESTFS log_must chmod -R 777 /$TESTFS ssh -t $NFS_USER@$NFS_CLIENT "mkdir -m 777 -p $NFS_MOUNT" ssh -t $NFS_USER@$NFS_CLIENT "sudo -S umount $NFS_MOUNT" log_must ssh -t $NFS_USER@$NFS_CLIENT " sudo -S mount $NFS_OPTIONS $NFS_SERVER:/$TESTFS $NFS_MOUNT " # # The variables in the fio script are only available in our current # shell session, so we have to evaluate them here before copying # the resulting script over to the target machine. # export jobnum='$jobnum' while read line; do eval echo "$line" done < $FIO_SCRIPTS/$script > /tmp/test.fio log_must sed -i -e "s%directory.*%directory=$NFS_MOUNT%" /tmp/test.fio log_must scp /tmp/test.fio $NFS_USER@$NFS_CLIENT:/tmp log_must rm /tmp/test.fio } # # This function iterates through the value pairs in $PERF_COLLECT_SCRIPTS. # The script at index N is launched in the background, with its output # redirected to a logfile containing the tag specified at index N + 1. # function do_collect_scripts { typeset suffix=$1 [[ -n $collect_scripts ]] || log_fail "No data collection scripts." [[ -n $PERF_RUNTIME ]] || log_fail "No runtime specified." # Add in user supplied scripts and logfiles, if any. typeset oIFS=$IFS IFS=',' for item in $PERF_COLLECT_SCRIPTS; do collect_scripts+=($(echo $item | sed 's/^ *//g')) done IFS=$oIFS typeset idx=0 while [[ $idx -lt "${#collect_scripts[@]}" ]]; do typeset logbase="$(get_perf_output_dir)/$(basename \ $SUDO_COMMAND)" typeset outfile="$logbase.${collect_scripts[$idx + 1]}.$suffix" timeout $PERF_RUNTIME ${collect_scripts[$idx]} >$outfile 2>&1 & ((idx += 2)) done # Need to explicitly return 0 because timeout(1) will kill # a child process and cause us to return non-zero. return 0 } # Find a place to deposit performance data collected while under load. function get_perf_output_dir { typeset dir="$(pwd)/perf_data" [[ -d $dir ]] || mkdir -p $dir echo $dir } function apply_zinject_delays { typeset idx=0 while [[ $idx -lt "${#ZINJECT_DELAYS[@]}" ]]; do [[ -n ${ZINJECT_DELAYS[$idx]} ]] || \ log_must "No zinject delay found at index: $idx" for disk in $DISKS; do log_must zinject \ -d $disk -D ${ZINJECT_DELAYS[$idx]} $PERFPOOL done ((idx += 1)) done } function clear_zinject_delays { log_must zinject -c all } # # Destroy and create the pool used for performance tests. # function recreate_perf_pool { [[ -n $PERFPOOL ]] || log_fail "The \$PERFPOOL variable isn't set." # # In case there's been some "leaked" zinject delays, or if the # performance test injected some delays itself, we clear all # delays before attempting to destroy the pool. Each delay # places a hold on the pool, so the destroy will fail if there # are any outstanding delays. # clear_zinject_delays # # This function handles the case where the pool already exists, # and will destroy the previous pool and recreate a new pool. # create_pool $PERFPOOL $DISKS } function verify_threads_per_fs { typeset threads=$1 typeset threads_per_fs=$2 log_must test -n $threads log_must test -n $threads_per_fs # # A value of "0" is treated as a "special value", and it is # interpreted to mean all threads will run using a single # filesystem. # [[ $threads_per_fs -eq 0 ]] && return # # The number of threads per filesystem must be a value greater # than or equal to zero; since we just verified the value isn't # 0 above, then it must be greater than zero here. # log_must test $threads_per_fs -ge 0 # # This restriction can be lifted later if needed, but for now, # we restrict the number of threads per filesystem to a value # that evenly divides the thread count. This way, the threads # will be evenly distributed over all the filesystems. # log_must test $((threads % threads_per_fs)) -eq 0 } function populate_perf_filesystems { typeset nfilesystems=${1:-1} export TESTFS="" for i in $(seq 1 $nfilesystems); do typeset dataset="$PERFPOOL/fs$i" create_dataset $dataset $PERF_FS_OPTS if [[ -z "$TESTFS" ]]; then TESTFS="$dataset" else TESTFS="$TESTFS $dataset" fi done } function get_nfilesystems { typeset filesystems=( $TESTFS ) echo ${#filesystems[@]} } function get_directory { typeset filesystems=( $TESTFS ) typeset directory= typeset idx=0 while [[ $idx -lt "${#filesystems[@]}" ]]; do mountpoint=$(get_prop mountpoint "${filesystems[$idx]}") if [[ -n $directory ]]; then directory=$directory:$mountpoint else directory=$mountpoint fi ((idx += 1)) done echo $directory } function get_min_arc_size { typeset -l min_arc_size if is_freebsd; then min_arc_size=$(sysctl -n kstat.zfs.misc.arcstats.c_min) elif is_illumos; then min_arc_size=$(dtrace -qn 'BEGIN { printf("%u\n", `arc_stats.arcstat_c_min.value.ui64); exit(0); }') elif is_linux; then min_arc_size=`awk '$1 == "c_min" { print $3 }' \ /proc/spl/kstat/zfs/arcstats` fi [[ $? -eq 0 ]] || log_fail "get_min_arc_size failed" echo $min_arc_size } function get_max_arc_size { typeset -l max_arc_size if is_freebsd; then max_arc_size=$(sysctl -n kstat.zfs.misc.arcstats.c_max) elif is_illumos; then max_arc_size=$(dtrace -qn 'BEGIN { printf("%u\n", `arc_stats.arcstat_c_max.value.ui64); exit(0); }') elif is_linux; then max_arc_size=`awk '$1 == "c_max" { print $3 }' \ /proc/spl/kstat/zfs/arcstats` fi [[ $? -eq 0 ]] || log_fail "get_max_arc_size failed" echo $max_arc_size } function get_arc_target { typeset -l arc_c if is_freebsd; then arc_c=$(sysctl -n kstat.zfs.misc.arcstats.c) elif is_illumos; then arc_c=$(dtrace -qn 'BEGIN { printf("%u\n", `arc_stats.arcstat_c.value.ui64); exit(0); }') elif is_linux; then arc_c=`awk '$1 == "c" { print $3 }' \ /proc/spl/kstat/zfs/arcstats` fi [[ $? -eq 0 ]] || log_fail "get_arc_target failed" echo $arc_c } function get_dbuf_cache_size { typeset -l dbuf_cache_size dbuf_cache_shift if is_illumos; then dbuf_cache_size=$(dtrace -qn 'BEGIN { printf("%u\n", `dbuf_cache_max_bytes); exit(0); }') else dbuf_cache_shift=$(get_tunable DBUF_CACHE_SHIFT) dbuf_cache_size=$(($(get_arc_target) / 2**dbuf_cache_shift)) fi [[ $? -eq 0 ]] || log_fail "get_dbuf_cache_size failed" echo $dbuf_cache_size } # Create a file with some information about how this system is configured. function get_system_config { typeset config=$PERF_DATA_DIR/$1 echo "{" >>$config if is_linux; then echo " \"ncpus\": \"$(nproc --all)\"," >>$config echo " \"physmem\": \"$(free -b | \ awk '$1 == "Mem:" { print $2 }')\"," >>$config echo " \"c_max\": \"$(get_max_arc_size)\"," >>$config echo " \"hostname\": \"$(uname -n)\"," >>$config echo " \"kernel version\": \"$(uname -sr)\"," >>$config else dtrace -qn 'BEGIN{ printf(" \"ncpus\": %d,\n", `ncpus); printf(" \"physmem\": %u,\n", `physmem * `_pagesize); printf(" \"c_max\": %u,\n", `arc_stats.arcstat_c_max.value.ui64); printf(" \"kmem_flags\": \"0x%x\",", `kmem_flags); exit(0)}' >>$config echo " \"hostname\": \"$(uname -n)\"," >>$config echo " \"kernel version\": \"$(uname -v)\"," >>$config fi if is_linux; then lsblk -dino NAME,SIZE | awk 'BEGIN { printf(" \"disks\": {\n"); first = 1} {disk = $1} {size = $2; if (first != 1) {printf(",\n")} else {first = 0} printf(" \"%s\": \"%s\"", disk, size)} END {printf("\n },\n")}' >>$config zfs_tunables="/sys/module/zfs/parameters" printf " \"tunables\": {\n" >>$config for tunable in \ zfs_arc_max \ zfs_arc_meta_limit \ zfs_arc_sys_free \ zfs_dirty_data_max \ zfs_flags \ zfs_prefetch_disable \ zfs_txg_timeout \ zfs_vdev_aggregation_limit \ zfs_vdev_async_read_max_active \ zfs_vdev_async_write_max_active \ zfs_vdev_sync_read_max_active \ zfs_vdev_sync_write_max_active \ zio_slow_io_ms do if [ "$tunable" != "zfs_arc_max" ] then printf ",\n" >>$config fi printf " \"$tunable\": \"$(<$zfs_tunables/$tunable)\"" \ >>$config done printf "\n }\n" >>$config else iostat -En | awk 'BEGIN { printf(" \"disks\": {\n"); first = 1} /^c/ {disk = $1} /^Size: [^0]/ {size = $2; if (first != 1) {printf(",\n")} else {first = 0} printf(" \"%s\": \"%s\"", disk, size)} END {printf("\n },\n")}' >>$config sed -n 's/^set \(.*\)[ ]=[ ]\(.*\)/\1=\2/p' /etc/system | \ awk -F= 'BEGIN {printf(" \"system\": {\n"); first = 1} {if (first != 1) {printf(",\n")} else {first = 0}; printf(" \"%s\": %s", $1, $2)} END {printf("\n }\n")}' >>$config fi echo "}" >>$config } function num_jobs_by_cpu { if is_linux; then typeset ncpu=$($NPROC --all) else typeset ncpu=$(psrinfo | $WC -l) fi typeset num_jobs=$ncpu [[ $ncpu -gt 8 ]] && num_jobs=$(echo "$ncpu * 3 / 4" | bc) echo $num_jobs } # # On illumos this looks like: ":sd3:sd4:sd1:sd2:" # function pool_to_lun_list { typeset pool=$1 typeset ctd ctds devname lun typeset lun_list=':' if is_illumos; then ctds=$(zpool list -v $pool | awk '/c[0-9]*t[0-9a-fA-F]*d[0-9]*/ {print $1}') for ctd in $ctds; do # Get the device name as it appears in /etc/path_to_inst devname=$(readlink -f /dev/dsk/${ctd}s0 | sed -n \ 's/\/devices\([^:]*\):.*/\1/p') # Add a string composed of the driver name and instance # number to the list for comparison with dev_statname. lun=$(sed 's/"//g' /etc/path_to_inst | grep \ $devname | awk '{print $3$2}') lun_list="$lun_list$lun:" done elif is_freebsd; then lun_list+=$(zpool list -HLv $pool | \ awk '/a?da[0-9]+|md[0-9]+|mfid[0-9]+|nda[0-9]+|nvd[0-9]+|vtbd[0-9]+/ { printf "%s:", $1 }') elif is_linux; then ctds=$(zpool list -HLv $pool | \ awk '/sd[a-z]*|loop[0-9]*|dm-[0-9]*/ {print $1}') for ctd in $ctds; do lun_list="$lun_list$ctd:" done fi echo $lun_list } function print_perf_settings { echo "PERF_NTHREADS: $PERF_NTHREADS" echo "PERF_NTHREADS_PER_FS: $PERF_NTHREADS_PER_FS" echo "PERF_SYNC_TYPES: $PERF_SYNC_TYPES" echo "PERF_IOSIZES: $PERF_IOSIZES" } # Create a perf_data directory to hold performance statistics and # configuration information. export PERF_DATA_DIR=$(get_perf_output_dir) [[ -f $PERF_DATA_DIR/config.json ]] || get_system_config config.json diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_reads.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_reads.ksh index 5c8066d17549..70e273166161 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_reads.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_reads.ksh @@ -1,96 +1,97 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the random_reads job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read from are created prior to the first fio run, and used # for all fio runs. The ARC is cleared with `zinject -a` prior to each run # so reads will go to disk. # # Thread/Concurrency settings: # PERF_NTHREADS defines the number of files created in the test filesystem, # as well as the number of threads that will simultaneously drive IO to # those files. The settings chosen are from measurements in the # PerfAutoESX/ZFSPerfESX Environments, selected at concurrency levels that # are at peak throughput but lowest latency. Higher concurrency introduces # queue time latency and would reduce the impact of code-induced performance # regressions. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'16 32'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Layout the files to be used by the read tests. Create as many files as the # largest number of threads. An fio run with fewer threads will use a subset # of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Random reads with settings: $(print_perf_settings)" do_fio_run random_reads.fio false true log_pass "Measure IO stats during random read load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh index 33d7d8c8d945..4dc1e3ddba9a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh @@ -1,96 +1,97 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the random_readwrite job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read and write from are created prior to the first fio run, # and used for all fio runs. The ARC is cleared with `zinject -a` prior to # each run so reads will go to disk. # # Thread/Concurrency settings: # PERF_NTHREADS defines the number of files created in the test filesystem, # as well as the number of threads that will simultaneously drive IO to # those files. The settings chosen are from measurements in the # PerfAutoESX/ZFSPerfESX Environments, selected at concurrency levels that # are at peak throughput but lowest latency. Higher concurrency introduces # queue time latency and would reduce the impact of code-induced performance # regressions. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'32 64'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} export PERF_IOSIZES='' # bssplit used instead +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Layout the files to be used by the readwrite tests. Create as many files # as the largest number of threads. An fio run with fewer threads will use # a subset of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Random reads and writes with settings: $(print_perf_settings)" do_fio_run random_readwrite.fio false true log_pass "Measure IO stats during random read and write load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite_fixed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite_fixed.ksh index bb4014563f1f..cdf15c701277 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite_fixed.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_readwrite_fixed.ksh @@ -1,88 +1,89 @@ #!/bin/ksh # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2017, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the random_readwrite_fixed job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read and write from are created prior to the first fio run, # and used for all fio runs. The ARC is cleared with `zinject -a` prior to # each run so reads will go to disk. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read write load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} # Layout the files to be used by the readwrite tests. Create as many files # as the largest number of threads. An fio run with fewer threads will use # a subset of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "kstat zfs:0 1" "kstat" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" "dtrace -Cs $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "dtrace -s $PERF_SCRIPTS/profile.d" "profile" ) fi log_note "Random reads and writes with settings: $(print_perf_settings)" do_fio_run random_readwrite_fixed.fio false true log_pass "Measure IO stats during random read and write load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes.ksh index 4b826835efbf..30db7564c2dc 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes.ksh @@ -1,87 +1,88 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the random_writes job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # Prior to each fio run the dataset is recreated, and fio writes new files # into an otherwise empty pool. # # Thread/Concurrency settings: # PERF_NTHREADS defines the number of files created in the test filesystem, # as well as the number of threads that will simultaneously drive IO to # those files. The settings chosen are from measurements in the # PerfAutoESX/ZFSPerfESX Environments, selected at concurrency levels that # are at peak throughput but lowest latency. Higher concurrency introduces # queue time latency and would reduce the impact of code-induced performance # regressions. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'32 128'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Random writes with settings: $(print_perf_settings)" do_fio_run random_writes.fio true false log_pass "Measure IO stats during random write load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes_zil.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes_zil.ksh index 522ee4526828..ff6d465bb722 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes_zil.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/random_writes_zil.ksh @@ -1,83 +1,84 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat # # We're using many filesystems depending on the number of # threads for each test, and there's no good way to get a list # of all the filesystems that should be destroyed on cleanup # (i.e. the list of filesystems used for the last test ran). # Thus, we simply recreate the pool as a way to destroy all # filesystems and leave a fresh pool behind. # recreate_perf_pool } trap "log_fail \"Measure IO stats during random write load\"" SIGTERM log_onexit cleanup recreate_perf_pool # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. -export PERF_NTHREADS=${PERF_NTHREADS:-'1 4 16 64'} +export PERF_NTHREADS=${PERF_NTHREADS:-'1 16 64'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0 1'} export PERF_IOSIZES=${PERF_IOSIZES:-'8k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Until the performance tests over NFS can deal with multiple file systems, # force the use of only one file system when testing over NFS. [[ $NFS -eq 1 ]] && PERF_NTHREADS_PER_FS='0' lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "kstat zfs:0 1" "kstat" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" "dtrace -Cs $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "dtrace -s $PERF_SCRIPTS/zil.d $PERFPOOL 1" "zil" "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "dtrace -s $PERF_SCRIPTS/offcpu-profile.d" "offcpu-profile" ) fi log_note \ "ZIL specific random write workload with settings: $(print_perf_settings)" do_fio_run random_writes.fio true false log_pass "Measure IO stats during ZIL specific random write workload" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh index 2bdfff736f4e..cc6d17245239 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh @@ -1,98 +1,99 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the sequential_reads job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read from are created prior to the first fio run, and used # for all fio runs. The ARC is cleared with `zinject -a` prior to each run # so reads will go to disk. # # Thread/Concurrency settings: # PERF_NTHREADS defines the number of files created in the test filesystem, # as well as the number of threads that will simultaneously drive IO to # those files. The settings chosen are from measurements in the # PerfAutoESX/ZFSPerfESX Environments, selected at concurrency levels that # are at peak throughput but lowest latency. Higher concurrency introduces # queue time latency and would reduce the impact of code-induced performance # regressions. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'8 16'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Layout the files to be used by the read tests. Create as many files as the # largest number of threads. An fio run with fewer threads will use a subset # of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "$PERF_SCRIPTS/prefetch_io.sh $PERFPOOL 1" "prefetch" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Sequential reads with settings: $(print_perf_settings)" do_fio_run sequential_reads.fio false true log_pass "Measure IO stats during sequential read load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh index 8127786361ba..d885d0b7f992 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh @@ -1,88 +1,89 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the sequential_reads job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read from are created prior to the first fio run, and used # for all fio runs. The ARC is not cleared to ensure that all data is cached. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Make sure the working set can be cached in the arc. Aim for 1/2 of arc. export TOTAL_SIZE=$(($(get_max_arc_size) / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} -export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +export PERF_IOSIZES=${PERF_IOSIZES:-'128k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Layout the files to be used by the read tests. Create as many files as the # largest number of threads. An fio run with fewer threads will use a subset # of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "$PERF_SCRIPTS/prefetch_io.sh $PERFPOOL 1" "prefetch" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Sequential cached reads with settings: $(print_perf_settings)" do_fio_run sequential_reads.fio false false log_pass "Measure IO stats during sequential cached read load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh index 8ce1273c2869..80081a098e8f 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh @@ -1,115 +1,116 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the sequential_reads job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read from are created prior to the first fio run, and used # for all fio runs. This test will exercise cached read performance from # a clone filesystem. The data is initially cached in the ARC and then # a snapshot and clone are created. All the performance runs are then # initiated against the clone filesystem to exercise the performance of # reads when the ARC has to create another buffer from a different dataset. # It will also exercise the need to evict the duplicate buffer once the last # reference on that buffer is released. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Make sure the working set can be cached in the arc. Aim for 1/2 of arc. export TOTAL_SIZE=$(($(get_max_arc_size) / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'64 128'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} -export PERF_IOSIZES=${PERF_IOSIZES:-'128k 1m'} +export PERF_IOSIZES=${PERF_IOSIZES:-'128k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Layout the files to be used by the read tests. Create as many files as the # largest number of threads. An fio run with fewer threads will use a subset # of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # # Only a single filesystem is used by this test. To be defensive, we # double check that TESTFS only contains a single filesystem. We # wouldn't want to assume this was the case, and have it actually # contain multiple filesystem (causing cascading failures later). # log_must test $(get_nfilesystems) -eq 1 log_note "Creating snapshot, $TESTSNAP, of $TESTFS" create_snapshot $TESTFS $TESTSNAP log_note "Creating clone, $PERFPOOL/$TESTCLONE, from $TESTFS@$TESTSNAP" create_clone $TESTFS@$TESTSNAP $PERFPOOL/$TESTCLONE # # We want to run FIO against the clone we created above, and not the # clone's originating filesystem. Thus, we override the default behavior # and explicitly set TESTFS to the clone. # export TESTFS=$PERFPOOL/$TESTCLONE # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "$PERF_SCRIPTS/prefetch_io.sh $PERFPOOL 1" "prefetch" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Sequential cached reads from $DIRECTORY with " \ "ettings: $(print_perf_settings)" do_fio_run sequential_reads.fio false false log_pass "Measure IO stats during sequential cached read load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh index adacdc29799c..0402b48a3e9c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh @@ -1,94 +1,95 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2016, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the sequential_reads job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # The files to read from are created prior to the first fio run, and used # for all fio runs. The ARC is not cleared to ensure that all data is cached. # # This is basically a copy of the sequential_reads_cached test case, but with # a smaller dataset so that we can fit everything into the decompressed, linear # space in the dbuf cache. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during sequential read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Ensure the working set can be cached in the dbuf cache. export TOTAL_SIZE=$(($(get_dbuf_cache_size) * 3 / 4)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'64'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} export PERF_IOSIZES=${PERF_IOSIZES:-'64k'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} # Layout the files to be used by the read tests. Create as many files as the # largest number of threads. An fio run with fewer threads will use a subset # of the available files. export NUMJOBS=$(get_max $PERF_NTHREADS) export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) export DIRECTORY=$(get_directory) log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "$PERF_SCRIPTS/prefetch_io.sh $PERFPOOL 1" "prefetch" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "kstat zfs:0 1" "kstat" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" "dtrace -Cs $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "dtrace -s $PERF_SCRIPTS/profile.d" "profile" ) fi log_note "Sequential cached reads with settings: $(print_perf_settings)" do_fio_run sequential_reads.fio false false log_pass "Measure IO stats during sequential cached read load" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh index d32690a0542e..7850bc0375df 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh @@ -1,87 +1,88 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2015, 2021 by Delphix. All rights reserved. # # # Description: # Trigger fio runs using the sequential_writes job file. The number of runs and # data collected is determined by the PERF_* variables. See do_fio_run for # details about these variables. # # Prior to each fio run the dataset is recreated, and fio writes new files # into an otherwise empty pool. # # Thread/Concurrency settings: # PERF_NTHREADS defines the number of files created in the test filesystem, # as well as the number of threads that will simultaneously drive IO to # those files. The settings chosen are from measurements in the # PerfAutoESX/ZFSPerfESX Environments, selected at concurrency levels that # are at peak throughput but lowest latency. Higher concurrency introduces # queue time latency and would reduce the impact of code-induced performance # regressions. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/perf/perf.shlib function cleanup { # kill fio and iostat pkill fio pkill iostat recreate_perf_pool } trap "log_fail \"Measure IO stats during random read load\"" SIGTERM log_onexit cleanup recreate_perf_pool populate_perf_filesystems # Aim to fill the pool to 50% capacity while accounting for a 3x compressratio. export TOTAL_SIZE=$(($(get_prop avail $PERFPOOL) * 3 / 2)) # Variables specific to this test for use by fio. export PERF_NTHREADS=${PERF_NTHREADS:-'16 32'} export PERF_NTHREADS_PER_FS=${PERF_NTHREADS_PER_FS:-'0'} -export PERF_IOSIZES=${PERF_IOSIZES:-'8k 128k 1m'} +export PERF_IOSIZES=${PERF_IOSIZES:-'8k 1m'} +export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'0 1'} # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" if is_linux; then typeset perf_record_cmd="perf record -F 99 -a -g -q \ -o /dev/stdout -- sleep ${PERF_RUNTIME}" export collect_scripts=( "zpool iostat -lpvyL $PERFPOOL 1" "zpool.iostat" "vmstat -t 1" "vmstat" "mpstat -P ALL 1" "mpstat" "iostat -tdxyz 1" "iostat" "$perf_record_cmd" "perf" ) else export collect_scripts=( "$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" "vmstat -T d 1" "vmstat" "mpstat -T d 1" "mpstat" "iostat -T d -xcnz 1" "iostat" ) fi log_note "Sequential writes with settings: $(print_perf_settings)" do_fio_run sequential_writes.fio true false log_pass "Measure IO stats during sequential write load" diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 82f6d415f966..8bbe141827db 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,849 +1,864 @@ /* * $FreeBSD$ */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ +/* blk_queue_update_readahead() exists */ +/* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ + /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ +/* disk_update_readahead() exists */ +/* #undef HAVE_DISK_UPDATE_READAHEAD */ + /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global */ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ +/* iops->get_acl() exists */ +/* #undef HAVE_GET_ACL */ + +/* iops->get_acl() takes rcu */ +/* #undef HAVE_GET_ACL_RCU */ + /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. */ #define HAVE_ISSETUGID 1 /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* Define to 1 if you have the header file. */ #define HAVE_MEMORY_H 1 /* iops->mkdir() takes umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. */ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_add_batch() is defined */ /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */ /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. */ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() exists, takes 3 args */ /* #undef HAVE_SET_ACL */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* set_special_state() exists */ /* #undef HAVE_SET_SPECIAL_STATE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* signal_stop() exists */ /* #undef HAVE_SIGNAL_STOP */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* ->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ +/* standalone exists */ +/* #undef HAVE_STANDALONE_LINUX_STDARG */ + /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* __posix_acl_chmod() exists */ /* #undef HAVE___POSIX_ACL_CHMOD */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* TBD: fetch(3) support */ #if 0 /* whether the chosen libfetch is to be loaded at run-time */ #define LIBFETCH_DYNAMIC 1 /* libfetch is fetch(3) */ #define LIBFETCH_IS_FETCH 1 /* libfetch is libcurl */ #define LIBFETCH_IS_LIBCURL 0 /* soname of chosen libfetch */ #define LIBFETCH_SONAME "libfetch.so.6" #endif /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* hardened module_param_call */ /* #undef MODULE_PARAM_CALL_CONST */ /* struct shrink_control has nid */ /* #undef SHRINK_CONTROL_HAS_NID */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* True if ZFS is to be compiled for a FreeBSD system */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* zfs debugging enabled */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* enum node_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ /* enum node_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum node_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ /* enum zone_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ /* enum zone_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum zone_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ /* global_node_page_state() exists */ /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ /* global_zone_page_state() exists */ /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ /* Define to 1 if GPL-only symbols can be used */ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_g4a1195ca5" +#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gec64fdb93" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "5.13" +#define ZFS_META_KVER_MAX "5.14" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. */ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_g4a1195ca5" +#define ZFS_META_RELEASE "FreeBSD_gec64fdb93" /* Define the project version. */ #define ZFS_META_VERSION "2.1.99" /* count is located in percpu_ref.data */ /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */ diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 1af08380dca4..00aa4282082e 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1,5 +1,5 @@ /* * $FreeBSD$ */ -#define ZFS_META_GITREV "zfs-2.1.99-453-g4a1195ca50" +#define ZFS_META_GITREV "zfs-2.1.99-485-gec64fdb93"